In [195]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [196]:
# Build a synthetic binary-classification dataset:
#   * N_ROWS rows
#   * N_NUMERIC standard-normal numeric features (feature_0 .. feature_5)
#   * N_MISSING missing values injected into feature_0 (for the imputation task)
#   * two 3-level categorical columns (for the encoding task)
#   * a random 0/1 target
# NOTE: the target is drawn independently of every feature, so accuracy
# close to 50% in the modelling cells is the expected outcome, not a bug.
N_ROWS = 1000
N_NUMERIC = 6
N_MISSING = 50

# Legacy global seed: the data depends on the order of the np.random calls
# below, so keep that order unchanged if the displayed outputs must match.
np.random.seed(42)

# Numeric features
data = np.random.randn(N_ROWS, N_NUMERIC)
df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(N_NUMERIC)])

# Introduce missing values in one column (distinct rows, hence replace=False)
df.loc[np.random.choice(df.index, size=N_MISSING, replace=False), 'feature_0'] = np.nan

# Add two categorical columns to be encoded later
df['category_1'] = np.random.choice(['A', 'B', 'C'], size=N_ROWS)
df['category_2'] = np.random.choice(['X', 'Y', 'Z'], size=N_ROWS)

# Add the binary target column
df['target'] = np.random.randint(0, 2, size=N_ROWS)

# Sanity checks on the generated frame
assert df.shape == (N_ROWS, N_NUMERIC + 3)
assert df['feature_0'].isna().sum() == N_MISSING

In [197]:
# Preview the first five rows of the generated frame.
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,category_1,category_2,target
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,B,Z,1
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573,B,Y,1
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,C,X,0
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748,A,X,0
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,A,Z,1


Task1: Pre-processing like missing value imputation or encoding

In [198]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

feature_0     50
feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
category_1     0
category_2     0
target         0
dtype: int64

In [199]:
# Candidate fill values for the missing entries of feature_0.
feature_0_values = df['feature_0']
mean, median = feature_0_values.mean(), feature_0_values.median()
# For a continuous column the values are typically all distinct, so the "mode"
# is just the first of the tied values and is not a meaningful fill candidate.
mode = feature_0_values.mode().iloc[0]
print(f'Mean: {mean}, Median: {median}, Mode: {mode}')

Mean: 0.01253227753702049, Median: 0.01142597087531547, Mode: -2.6357477390168778


In [200]:
# Mean-impute the gaps in feature_0, then re-check that no nulls remain.
# NOTE(review): `mean` was computed over all rows, including the ones that are
# later held out as the test set.
missing_mask = df['feature_0'].isna()
df.loc[missing_mask, 'feature_0'] = mean
df.isna().sum()

feature_0     0
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
category_1    0
category_2    0
target        0
dtype: int64

In [201]:
# Frequency of each level of category_1 (checks the levels before encoding).
df['category_1'].value_counts()

category_1
A    345
C    342
B    313
Name: count, dtype: int64

In [202]:
# Frequency of each level of category_2 (checks the levels before encoding).
df['category_2'].value_counts()

category_2
Y    344
X    336
Z    320
Name: count, dtype: int64

In [203]:
# Label-encode category_1 with an explicit, fixed mapping.
category_1_mapping = {'A': 0, 'B': 1, 'C': 2}

# Apply the mapping with a fallback to the current value. A plain
# `.map(category_1_mapping)` turns every value that is not a key into NaN, so
# re-running this cell (when the column already holds 0/1/2) would silently
# wipe the column. With the fallback a second run is a no-op.
df['category_1'] = df['category_1'].map(lambda level: category_1_mapping.get(level, level))

# Fail loudly on any level the mapping does not cover instead of carrying it on.
assert df['category_1'].isin(list(category_1_mapping.values())).all(), \
    'category_1 contains levels missing from category_1_mapping'
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,category_1,category_2,target
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1,Z,1
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573,1,Y,1
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,2,X,0
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748,0,X,0
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,0,Z,1


In [204]:
# One-hot encode category_2: build the indicator columns separately and append
# them after the remaining columns, replacing the original text column.
category_2_dummies = pd.get_dummies(df['category_2'], prefix='category_2')
df = pd.concat([df.drop(columns=['category_2']), category_2_dummies], axis=1)
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,category_1,target,category_2_X,category_2_Y,category_2_Z
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1,1,False,False,True
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573,1,1,False,True,False
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,2,0,True,False,False
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748,0,0,True,False,False
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,0,1,False,False,True


Task2: Pre-processing like Scaling

In [205]:
# Separate the label from the predictors.
y = df['target']
X = df.drop(columns=['target'])

In [206]:
# Standardise every predictor column.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

Task3: Model building

In [207]:
# Split the *unscaled* predictors first, then learn the scaling parameters from
# the training rows only. Fitting the scaler on every row before splitting lets
# test-set statistics (mean / std) leak into the training features.
# The same random_state and test_size select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on train, apply to both; X_train / X_test are NumPy arrays as before.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [208]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters.
# random_state is fixed because the tree breaks ties between equally good
# splits randomly; without it the fit draws from NumPy's global RNG and the
# result changes whenever this cell is re-run on its own.
# NOTE: the variable is called `lr` although the model is a decision tree; the
# name is kept because the evaluation cell refers to it.
lr = DecisionTreeClassifier(random_state=42)
lr.fit(X_train, y_train)

Task4: Model evaluation

In [209]:

# Score the fitted model on the held-out test rows.
y_pred = lr.predict(X_test)

# Scalar metrics, all computed from the same (y_test, y_pred) pair.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

# Emit one "<name>: <value>" line per metric, then the confusion matrix.
report = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}
print('\n'.join(f'{name}: {score}' for name, score in report.items()))
print(f'Confusion Matrix: \n{cm}')

Accuracy: 0.535
Precision: 0.6145833333333334
Recall: 0.5130434782608696
F1 Score: 0.5592417061611374
Confusion Matrix: 
[[48 37]
 [56 59]]


Task5: Model exploration for different parameters

In [210]:


# Hyper-parameter grid: 2 * 6 * 3 * 3 = 108 candidate trees.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
# The estimator gets a fixed random_state so that every candidate is fitted
# deterministically and the reported best parameters are reproducible.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search on the training split only
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate on the
# training data; it is not a test-set score.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Score: 0.5225
