In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb


In [None]:
df=pd.read_csv('train.csv')
df.head()
df.isnull().sum()
df=df.drop(['Name','PassengerId','Parch','SibSp','Cabin','Ticket','Embarked'],axis=1)
print(df.head())

   Survived  Pclass     Sex   Age     Fare
0         0       3    male  22.0   7.2500
1         1       1  female  38.0  71.2833
2         1       3  female  26.0   7.9250
3         1       1  female  35.0  53.1000
4         0       3    male  35.0   8.0500


In [None]:
#We can use K fold to get the best model but we have not implemeanted it over here

In [None]:
from sklearn.preprocessing import LabelEncoder
abc_Sex=LabelEncoder()
df['Sex_mno']=abc_Sex.fit_transform(df['Sex'])
df['Sex']=df['Sex_mno']
df=df.drop(['Sex_mno'],axis=1)
print(df)

     Survived  Pclass  Sex   Age     Fare
0           0       3    1  22.0   7.2500
1           1       1    0  38.0  71.2833
2           1       3    0  26.0   7.9250
3           1       1    0  35.0  53.1000
4           0       3    1  35.0   8.0500
..        ...     ...  ...   ...      ...
886         0       2    1  27.0  13.0000
887         1       1    0  19.0  30.0000
888         0       3    0   NaN  23.4500
889         1       1    1  26.0  30.0000
890         0       3    1  32.0   7.7500

[891 rows x 5 columns]


In [None]:
from sklearn.impute import KNNImputer
knn=KNNImputer(n_neighbors=3,add_indicator=True)
temp=df[['Age']]
knn.fit(temp)
df['Age']=knn.transform(temp)
df.isnull().sum()
print(df)

     Survived  Pclass  Sex        Age     Fare
0           0       3    1  22.000000   7.2500
1           1       1    0  38.000000  71.2833
2           1       3    0  26.000000   7.9250
3           1       1    0  35.000000  53.1000
4           0       3    1  35.000000   8.0500
..        ...     ...  ...        ...      ...
886         0       2    1  27.000000  13.0000
887         1       1    0  19.000000  30.0000
888         0       3    0  29.699118  23.4500
889         1       1    1  26.000000  30.0000
890         0       3    1  32.000000   7.7500

[891 rows x 5 columns]


In [None]:
#Applying MinMax normalization
from sklearn.preprocessing import MinMaxScaler
scalar=MinMaxScaler()
scalar.fit(df[['Age']])
df['Age_n']=scalar.transform(df[['Age']])
df['Age']=df['Age_n']
scalar.fit(df[['Fare']])
df['Fare_n']=scalar.transform(df[['Fare']])
df['Fare']=df["Fare_n"]
df=df.drop(['Fare_n'],axis=1)

df=df.drop(['Age_n'],axis=1)
print(df.head())


   Survived  Pclass  Sex       Age      Fare
0         0       3    1  0.271174  0.014151
1         1       1    0  0.472229  0.139136
2         1       3    0  0.321438  0.015469
3         1       1    0  0.434531  0.103644
4         0       3    1  0.434531  0.015713


In [None]:
target=df["Survived"]
df=df.drop(['Survived'],axis=1)

**Fixing the imbalance problem by Undersampling**


In [None]:
from sklearn.utils import resample
majority_class = target
minority_class = df

# Undersample majority class
majority_undersampled = resample(majority_class,
                                 replace=False,  # sample without replacement
                                 n_samples=len(minority_class),  # match minority class size
                                 random_state=123)  # reproducible results

# Combine minority class with undersampled majority class
undersampled  = pd.concat([majority_undersampled, minority_class])

**Fixing the imbalance problem by Oversampling or SMOTENC**


In [None]:
from imblearn.over_sampling import SMOTENC
import numpy as np

# Load the data
X = df
y = target

# Define the categorical features
categorical_features = [True, True, False, False]

# Create the SMOTENC object
smote = SMOTENC(categorical_features=categorical_features,random_state=123)

# Oversample the data
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print the class distribution before and after oversampling
print(f'Before: {np.bincount(y)}')
print(f'After: {np.bincount(y_resampled)}')



Before: [549 342]
After: [549 549]


In [None]:
from sklearn.model_selection import train_test_split
X_train,x_test,Y_train,y_test=train_test_split(X_resampled,y_resampled,test_size=0.3)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)


**HYPER PARAMETER TUNING**

In [None]:
clf=GridSearchCV(svm.SVC(gamma='auto'),{'C':[1,5,10,20,30,50,100],'kernel':['rbf','linear']},cv=5,return_train_score=False)
clf.fit(X_train,Y_train)
df123=pd.DataFrame(clf.cv_results_)
print(df123[['param_C','param_kernel','mean_test_score']])

   param_C param_kernel  mean_test_score
0        1          rbf         0.774764
1        1       linear         0.778661
2        5          rbf         0.774781
3        5       linear         0.778661
4       10          rbf         0.787794
5       10       linear         0.778661
6       20          rbf         0.794313
7       20       linear         0.778661
8       30          rbf         0.796910
9       30       linear         0.778661
10      50          rbf         0.803429
11      50       linear         0.778661
12     100          rbf         0.804736
13     100       linear         0.778661


In [None]:
print(df123[['param_C','param_kernel','mean_test_score']])


   param_C param_kernel  mean_test_score
0        1          rbf         0.774764
1        1       linear         0.778661
2        5          rbf         0.774781
3        5       linear         0.778661
4       10          rbf         0.787794
5       10       linear         0.778661
6       20          rbf         0.794313
7       20       linear         0.778661
8       30          rbf         0.796910
9       30       linear         0.778661
10      50          rbf         0.803429
11      50       linear         0.778661
12     100          rbf         0.804736
13     100       linear         0.778661


**REGULARIZATION**

As the Best Parameters are at index 12, however the index 10 has the same score with a far less C value. The "C" parameter controls the strength of the regularization. A smaller value of C corresponds to stronger regularization, while a larger value of C corresponds to weaker regularization.

Hence we will use the parameters at index 12

**SVC**




In [None]:
model = svm.SVC(C=50,kernel="rbf")
model.fit(X_train,Y_train)
print(" Model Score:")
print(model.score(x_test,y_test))
predictions=model.predict(x_test)



xg_model = xgb.XGBClassifier(base_model=model, n_estimators=10)
xg_model.fit(X_train, Y_train)

xg_score = xg_model.score(x_test, y_test)
print("Boosted Model Score: {:.2f}".format(xg_score))


print("Confusion matrix")
cm = confusion_matrix(y_test, predictions)
print(cm)

 Model Score:
0.793939393939394
Boosted Model Score: 0.82
Confusion matrix
[[134  34]
 [ 34 128]]


**Results**

Here we have predicted the survivors of the titanic dataset with 82% on the test data. This means that we used optimal techniques to fix the imbalance dataset problem,normalizations, missing data and above all the Classification Model.



