<a href="https://colab.research.google.com/github/Keerthana8888/CognoRise_Hackathon/blob/main/Code%20File/CognoRise_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Approach To The Problem

1. Load the data and libraries
2. Data Preparation and Data Transformation
3. Data Visualization
4. Exploratory Data Analysis
      * Univariate
      * Bivariate
      * Multivariate
         * Logistic Regression
         * Decision Tree Classifier
         * Random Forest Classifier
         * AdaBoosting Classifier
         * Gradient Boosting Classifier
         * XGBoosting Classifier
         * KNearestNeighbors Classifier
         * Support Vector Machine Classifier
         * Naive Bayes Classifier
5. Evaluate the model
6. Apply the model to test data
7. Evaluate results using the submission data


# Load The Libraries And Data

In [None]:
#Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Models Selection
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency
# Evaluators
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
# PCA Libraries
from sklearn.decomposition import PCA

In [None]:
# Read the training data, the test data and the sample submission template.
# The paths point at /content, so the three CSV files must be present there
# (presumably uploaded to the Colab session) before this cell runs.
train = pd.read_csv("/content/C&T train dataset.csv")
test  = pd.read_csv("/content/C&T test dataset.csv")
sample_submission = pd.read_csv("/content/C&T Bank_sample_submission.csv")

In [None]:
train.info()

In [None]:
train.head()

# Data Preparation

In [None]:
train.isnull().sum()

In [None]:
def data_prep(data):
  """Clean *data* in place: drop the 'sno' column and impute missing values.

  'employment_st', 'gurantors' and 'housing_type' are filled with their most
  frequent value; 'poi' and 'age' are filled with their median.

  Args:
    data: DataFrame containing the columns named above plus 'sno'. It is
      modified in place and nothing is returned.

  Raises:
    KeyError: if 'sno' or any of the imputed columns is missing.
  """
  # Deleting unwanted columns
  data.drop(['sno'], axis=1, inplace=True)

  # Imputing missing values with the mode.
  # Series.mode() returns a Series (several entries on a tie, none when the
  # column is entirely missing), so pick a single scalar to fill with instead
  # of broadcasting the whole Series against the column.
  for column in ('employment_st', 'gurantors', 'housing_type'):
    modes = data[column].mode(dropna=True)
    if not modes.empty:
      data[column] = data[column].fillna(modes.iloc[0])

  # Imputing missing values with the median.
  for column in ('poi', 'age'):
    data[column] = data[column].fillna(data[column].median(skipna=True))

In [None]:
data_prep(train)

In [None]:
train.info()

# Encoding

In [None]:
def data_encode(data, encoders=None):
  """Label-encode every object-dtype column of *data* in place.

  Args:
    data: DataFrame to encode; its object columns are replaced by integer
      codes. Nothing is returned.
    encoders: Optional dict mapping column name -> fitted LabelEncoder.
      A column that already has an entry is encoded with that stored
      encoder, so a second frame (e.g. the test set) gets the same integer
      codes as the frame the encoder was fitted on. Encoders fitted during
      this call are added to the dict. When omitted, every column is fitted
      from scratch on *data* alone, which is the original behaviour.

  Raises:
    ValueError: raised by LabelEncoder.transform when a reused encoder meets
      a label it was not fitted on.
  """
  if encoders is None:
    encoders = {}

  for column in data.columns:
    if data[column].dtype == 'object':
      if column in encoders:
        data[column] = encoders[column].transform(data[column])
      else:
        # One encoder per column, kept in the dict so the mapping is not lost
        encoders[column] = LabelEncoder()
        data[column] = encoders[column].fit_transform(data[column])

In [None]:
data_encode(train)

In [None]:
train.info()

In [None]:
train.head()

# Exploratory Data Analysis

In [None]:
train.describe()

## Univariate

In [None]:
train.skew()

In [None]:
# Distribution plot (histogram with a KDE curve) for every column of train
for col_name in train.columns:
  column_values = train[col_name]
  sns.displot(column_values, color='#014F86', kde=True)
  plt.title(f"Displot for {col_name}")

In [None]:
# One box plot per column, coloured by 'Group_no'
for feature in train.columns:
  if feature == 'Group_no':
    continue  # 'Group_no' is the hue, so it gets no plot of its own
  sns.boxplot(data=train, x=train[feature], hue=train['Group_no'], palette='mako')
  plt.title(f"boxplot for {feature}")
  plt.show()

In [None]:
sns.pairplot(train)

In [None]:
# Stacked KDE plot per column, split by 'Group_no' (which is itself skipped)
kde_columns = [name for name in train.columns if name != 'Group_no']
for feature in kde_columns:
  sns.kdeplot(data=train, x=train[feature], hue=train['Group_no'], multiple='stack', palette='mako')
  plt.title(f"KDE Plot for {feature}")
  plt.show()

In [None]:
# Categorical plot of every column against the row position, coloured by
# 'Group_no'. The y values are derived from the frame's length rather than a
# hard-coded 800 so the cell keeps working if the number of rows changes.
row_positions = range(len(train))
for i in train.columns:
  if i != 'Group_no':
    sns.catplot(data=train, x=train[i], y=row_positions, hue=train['Group_no'], palette='viridis')
    plt.title(f"Cat Plot for {i}")
    plt.show()

## Bivariate

In [None]:
# Pairwise correlations of all columns, rounded to two decimals for the cell labels
corr_matrix = train.corr().round(2)
plt.figure(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, cmap='crest')

# Splitting Of Dataset

In [None]:
# Target is 'Group_no'; every other column is a predictor
y = train['Group_no']
x = train.drop(columns=['Group_no'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 43)

# Model Evaluation

In [None]:
def model_eval(actual , predicted):
  """Print the accuracy, confusion matrix and classification report.

  Args:
    actual: True labels.
    predicted: Labels produced by a model for the same rows as *actual*.
  """
  # Compute all three metrics first, then print them in a fixed order
  matrix, report, accuracy = (
    confusion_matrix(actual, predicted),
    classification_report(actual, predicted),
    accuracy_score(actual, predicted),
  )
  print("The Accuracy of the Model is:", round(accuracy, 2))
  for section in (matrix, report):
    print(section)

# Multivariate

## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter = 10000)
lr.fit(x_train,y_train)

In [None]:
y_hat_train_lr = lr.predict(x_train)
y_hat_test_lr = lr.predict(x_test)

In [None]:
model_eval (y_train, y_hat_train_lr)

In [None]:
model_eval (y_test, y_hat_test_lr)

## Decision Tree

In [None]:
dtree = DecisionTreeClassifier(max_depth = 4)
dtree.fit(x_train, y_train)

In [None]:
y_hat_train_dtree = dtree.predict(x_train)
y_hat_test_dtree = dtree.predict(x_test)

In [None]:
model_eval (y_train, y_hat_train_dtree)

In [None]:
model_eval (y_test, y_hat_test_dtree)

In [None]:
# Plot the DecisionTreeClassifier that was fitted and scored (dtree) instead of
# fitting a separate DecisionTreeRegressor, which would treat the class labels
# as continuous values and is not the model whose accuracy is reported.
plt.figure()
clf = dtree  # keep the name `clf` defined for any later cell that refers to it
plot_tree(clf, filled=True, feature_names=list(x_train.columns))
plt.title("Decision tree train Dataset")
plt.show()

## Random Forest

In [None]:
# Seed the forest so repeated runs give the same model; rf is the model that
# produces the submission predictions. 43 matches the seed of the train/test split.
rf = RandomForestClassifier(n_estimators=200, max_depth=4, random_state=43)
rf.fit(x_train, y_train)

In [None]:
y_hat_train_rf = rf.predict(x_train)
y_hat_test_rf = rf.predict(x_test)

In [None]:
model_eval (y_train, y_hat_train_rf)

In [None]:
model_eval (y_test, y_hat_test_rf)

## Ada Boost

In [None]:
ada = AdaBoostClassifier(n_estimators = 200)
ada.fit(x_train, y_train)

In [None]:
y_hat_train_ada = ada.predict(x_train)
y_hat_test_ada = ada.predict(x_test)

In [None]:
model_eval (y_train, y_hat_train_ada)

In [None]:
model_eval (y_test, y_hat_test_ada)

## Gradient Boost

In [None]:
gb = GradientBoostingClassifier(n_estimators = 200, max_depth = 3)
gb.fit(x_train, y_train)

In [None]:
y_hat_train_gb = gb.predict(x_train)
y_hat_test_gb = gb.predict(x_test)

In [None]:
model_eval (y_train, y_hat_train_gb)

In [None]:
model_eval (y_test, y_hat_test_gb)

## XGBoosting

In [None]:
# Encode the class labels as integers for XGBoost.
# The mapping is learned from the training labels only and then applied to the
# test labels; re-fitting on y_test could assign different integers to the same
# class if the test split does not contain every class. transform() raises
# ValueError if y_test holds a label that never occurs in y_train.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
xgb = XGBClassifier(max_depth = 4, gamma = 6)
xgb.fit(x_train , y_train_enc)

In [None]:
y_hat_train_xgb = xgb.predict(x_train)
y_hat_test_xgb = xgb.predict(x_test)

In [None]:
model_eval(y_train_enc , y_hat_train_xgb)

In [None]:
model_eval(y_test_enc , y_hat_test_xgb)

## Naive Bayes

In [None]:
nb = GaussianNB()
nb.fit(x_train, y_train)

In [None]:
y_hat_train_nb = nb.predict(x_train)
y_hat_test_nb = nb.predict(x_test)

In [None]:
model_eval(y_train,y_hat_train_nb)

In [None]:
model_eval(y_test,y_hat_test_nb)

## KNearestNeighbors (KNN)

In [None]:
# Try every k from 1 to 99 and record the accuracy on x_test, rounded to 2 d.p.
# NOTE(review): k is chosen on the same split that is later used to report the
# model's accuracy - consider tuning on a separate validation split.
acc_list = []
for k in range(1, 100):
  knn2 = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
  y_hat_test_knn2 = knn2.predict(x_test)
  acc_list.append(round(accuracy_score(y_test, y_hat_test_knn2), 2))
print(acc_list)

In [None]:
# Plot the recorded accuracy against k.
# Dedicated names are used on purpose: assigning the plot data to `x` and `y`
# would overwrite the feature matrix and target created for the train/test split.
k_values = np.arange(1, len(acc_list) + 1)
plt.figure(figsize=(15, 10))
plt.plot(k_values, acc_list, marker='o', linestyle='-')
# Add annotations
for k, acc in zip(k_values, acc_list):
    plt.annotate(f'({k}, {acc})', (k, acc), textcoords="offset points", xytext=(0, 10), ha='center', rotation=90)
plt.xlabel("No of K")
plt.ylabel("Accuracy Score")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors = 34)
knn.fit(x_train , y_train )

In [None]:
y_hat_train_knn = knn.predict(x_train)
y_hat_test_knn = knn.predict(x_test)

In [None]:
model_eval(y_train , y_hat_train_knn)

In [None]:
model_eval(y_test , y_hat_test_knn)

## Support Vector Machine

In [None]:
svm = SVC(C = 5 , kernel = 'poly' , degree = 3)
svm.fit(x_train , y_train)

In [None]:
y_hat_train_svm = svm.predict(x_train)
y_hat_test_svm = svm.predict(x_test)

In [None]:
model_eval(y_train , y_hat_train_svm)

In [None]:
model_eval(y_test , y_hat_test_svm)

# Model Selection

In [None]:
# Test accuracy of every fitted model, collected into one comparison table.
# XGBoost and Naive Bayes are included so that all evaluated models are compared.
# XGBoost was trained on the encoded labels, so its predictions are scored
# against y_test_enc; every other model is scored against y_test.
test_predictions = [
  ('Logistic Regression', y_test, y_hat_test_lr),
  ('Decision Tree', y_test, y_hat_test_dtree),
  ('Random Forest', y_test, y_hat_test_rf),
  ('Ada Boosting', y_test, y_hat_test_ada),
  ('Gradient Boosting', y_test, y_hat_test_gb),
  ('XGBoost', y_test_enc, y_hat_test_xgb),
  ('Naive Bayes', y_test, y_hat_test_nb),
  ('KNN', y_test, y_hat_test_knn),
  ('SVM', y_test, y_hat_test_svm),
]
accuracy_table = [
  [model_name, round(accuracy_score(actual, predicted), 2)]
  for model_name, actual, predicted in test_predictions
]
df1 = pd.DataFrame(accuracy_table, columns = ['Model','Test_Accuracy'])
print(df1)

***After evaluating the various models, we found that the Random Forest Classifier outperforms the others, yielding the most favorable results during testing.***

# Application On Test Dataset

In [None]:
test.info()

In [None]:
test.head()

## Data Preparation

In [None]:
data_prep(test)

In [None]:
test.info()

## Encoding

In [None]:
data_encode(test)

In [None]:
test.info()

## Prediction

In [None]:
# Predict from exactly the columns the model was trained on, in training order.
# Selecting them explicitly keeps this cell re-runnable: after the first run
# `test` also holds 'Group_no', which must not be passed back in as a feature.
test['Group_no'] = rf.predict(test[x_train.columns])

In [None]:
test['Group_no']

# Submission File

In [None]:
sample_submission.head()

In [None]:
sample_submission['Group_no'] = test['Group_no']

In [None]:
sample_submission.head()

In [None]:
sample_submission.to_csv('Final Submission.csv' , index = False)