In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [None]:
# Colab-only helper: opens a browser file picker so a local file can be sent
# to the runtime.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# upload() returns a dict of {filename: file bytes}, and echoing that dict
# would dump the raw file contents into the notebook output.
uploaded = files.upload()

In [None]:
import pandas as pd

# Location of the uploaded workbook on the runtime.
file_path = '/content/Data.xlsx'

# Load the workbook into a DataFrame.
df = pd.read_excel(file_path)

# Sanity check: show the first five rows as plain text.
print(df.head(n=5))


       Name  Age  Gender  Credit Score            Output
0    Aahana   29  Female           780   Very good (5L+)
1  Aaradhya   38  Female           720        Good (1L+)
2     Aarav   32    Male           825  Excellent (10L+)
3     Aarav   44    Male           550       Poor (10K+)
4     Aarna   31  Female           660       Fair (50K+)


In [None]:
# Name is dropped so it is not part of the encoded feature frame.
df_without_name = df.drop(labels=['Name'], axis='columns')

In [None]:
# One-hot encode Gender. drop_first=True keeps a single indicator column
# (Gender_Male for this data). dtype=int pins the indicator to 0/1 integers:
# pandas >= 2.0 defaults to bool (True/False), which would no longer match the
# 0/1 encoding the rest of this notebook assumes.
df_encoded = pd.get_dummies(df_without_name, columns=['Gender'], drop_first=True, dtype=int)

In [None]:
# Display the encoded frame to confirm Gender was replaced by a Gender_Male indicator.
df_encoded

Unnamed: 0,Age,Credit Score,Output,Gender_Male
0,29,780,Very good (5L+),0
1,38,720,Good (1L+),0
2,32,825,Excellent (10L+),1
3,44,550,Poor (10K+),1
4,31,660,Fair (50K+),0
...,...,...,...,...
499,35,730,Good (1L+),0
500,24,672,Good (1L+),1
501,39,820,Excellent (10L+),1
502,29,630,Fair (50K+),1


In [None]:
# Give the indicator column a shorter name; Gender now means "is male" (1) vs. not (0).
df_encoded = df_encoded.rename(columns={'Gender_Male': 'Gender'})


In [None]:
# Preview the first rows to confirm the column rename took effect.
df_encoded.head()

Unnamed: 0,Age,Credit Score,Output,Gender
0,29,780,Very good (5L+),0
1,38,720,Good (1L+),0
2,32,825,Excellent (10L+),1
3,44,550,Poor (10K+),1
4,31,660,Fair (50K+),0


In [None]:

# Encoding for the target: category label -> integer class id.
output_mapping = {'Poor (10K+)': 0, 'Fair (50K+)': 1, 'Good (1L+)': 2, 'Very good (5L+)': 3, 'Excellent (10L+)': 4}

# Re-running this cell on an already-encoded column would map every value to
# NaN (the integer ids are not keys of output_mapping), so only encode while
# the column still holds the original labels.
if not df_encoded['Output'].isin(list(output_mapping.values())).all():
    encoded_output = df_encoded['Output'].map(output_mapping)
    # .map() silently yields NaN for labels missing from the mapping; fail loudly instead.
    unmapped_labels = df_encoded.loc[encoded_output.isna(), 'Output'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unmapped Output labels: {list(unmapped_labels)}")
    df_encoded['Output'] = encoded_output


In [None]:
# Preview the first rows to confirm Output now holds the integer class ids.
df_encoded.head()

Unnamed: 0,Age,Credit Score,Output,Gender
0,29,780,3,0
1,38,720,2,0
2,32,825,4,1
3,44,550,0,1
4,31,660,1,0


In [None]:
# Target vector: the integer-encoded Output column.
y = df_encoded['Output']
# Display the target for a quick check of its length and dtype.
y

0      3
1      2
2      4
3      0
4      1
      ..
499    2
500    2
501    4
502    1
503    3
Name: Output, Length: 504, dtype: int64

In [None]:
# Feature matrix: every column of the encoded frame except the target.
X = df_encoded.drop(labels=['Output'], axis='columns')


In [None]:
# Display the feature matrix (Age, Credit Score, Gender).
X

Unnamed: 0,Age,Credit Score,Gender
0,29,780,0
1,38,720,0
2,32,825,1
3,44,550,1
4,31,660,0
...,...,...,...
499,35,730,0
500,24,672,1
501,39,820,1
502,29,630,1


In [None]:
# Display the target again (same Series that was shown when y was created).
y

0      3
1      2
2      4
3      0
4      1
      ..
499    2
500    2
501    4
502    1
503    3
Name: Output, Length: 504, dtype: int64

In [None]:



# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Baseline: a decision tree with default hyperparameters and a fixed seed,
# fitted on the training split only.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)


In [None]:

# Predict class ids for the held-out rows with the baseline tree.
y_pred = clf.predict(X=X_test)


In [None]:

# Fraction of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9603960396039604


In [None]:

# Confusion matrix for the held-out rows: true class per row, predicted class per column.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Confusion Matrix:
[[15  0  0  0  0]
 [ 1 11  3  0  0]
 [ 0  0 29  0  0]
 [ 0  0  0 19  0]
 [ 0  0  0  0 23]]


In [None]:

# Per-class precision, recall and F1 on the held-out rows.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")



Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.73      0.85        15
           2       0.91      1.00      0.95        29
           3       1.00      1.00      1.00        19
           4       1.00      1.00      1.00        23

    accuracy                           0.96       101
   macro avg       0.97      0.95      0.95       101
weighted avg       0.96      0.96      0.96       101



In [None]:

# Hyperparameter tuning: attempt to improve on the baseline tree.
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid search tries every combination.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)



In [None]:
# Fresh, unfitted decision tree that serves as the base estimator for the grid search.
# Note: this rebinds `clf`, so the baseline model fitted earlier is no longer reachable by that name.
clf = DecisionTreeClassifier(random_state=42)



In [None]:
# Exhaustive search over param_grid, scored by 5-fold cross-validated accuracy
# on the training split; n_jobs=-1 uses all available CPU cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)




In [None]:
# Winning hyperparameter combination and the estimator built with it.
best_params, best_clf = grid_search.best_params_, grid_search.best_estimator_


In [None]:

# No explicit fit is needed here: GridSearchCV was created with the default
# refit=True, so best_estimator_ has already been refitted on the full training
# split with the best parameters. Fitting it again would only repeat that work.
# The estimator is left as the last expression so the cell still displays it.
best_clf


In [None]:

# Predict the held-out rows with the tuned tree (replaces the baseline y_pred).
y_pred = best_clf.predict(X=X_test)



In [None]:
# Held-out accuracy of the tuned tree, for comparison with the baseline score.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy after hyperparameter tuning:", accuracy)

# Report which hyperparameter combination the search selected.
print("\nBest Hyperparameters:", best_params)

Accuracy after hyperparameter tuning: 0.9603960396039604

Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
#TESTING
# A single applicant to score. Gender uses the training encoding: Female = 0, Male = 1.
sample_input = {
    "Age": 44,
    "Credit Score": 550,
    "Gender": 1,
}

# Select the columns by name in the exact order of the training features (X),
# so the prediction does not depend on the key order of sample_input, and a
# missing feature raises a KeyError instead of being scored incorrectly.
sample_df = pd.DataFrame([sample_input])[list(X.columns)]
predicted_output = best_clf.predict(sample_df)

# Invert label -> id into id -> label for a direct lookup of the predicted class.
category_by_id = {value: key for key, value in output_mapping.items()}
predicted_category = category_by_id[int(predicted_output[0])]
print("Predicted Output Category:", predicted_category)

Predicted Output Category: Poor (10K+)


In [None]:
import pickle

# Destination of the serialized model, relative to the current working directory.
model_file_path = "model.pkl"

# Serialize the tuned classifier; the context manager closes the file even on error.
with open(model_file_path, 'wb') as file:
    pickle.dump(best_clf, file, protocol=pickle.DEFAULT_PROTOCOL)

print("Model saved successfully as:", model_file_path)


Model saved successfully as: model.pkl
