In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


ModuleNotFoundError: No module named 'plotly'

In [None]:
# Mount Google Drive under /content/drive (Google Colab only) so that files
# stored there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the Titanic training CSV from the mounted Drive and preview the first rows.
# NOTE(review): the path is hard-coded to a Colab Drive layout.
df = pd.read_csv('/content/drive/MyDrive/Dataset/titanic_train.csv')
df.head()

Let's first check for missing **values**.

In [None]:
# Heatmap of the boolean missing-value mask from df.isnull(); row labels are
# hidden (yticklabels=False) so only the per-column pattern is shown.
sns.heatmap(df.isnull(),yticklabels=False,cmap='viridis')

What we observe is that there are a lot of missing values in the Age and Cabin columns and 1 in the Embarked column.

In [None]:
# Remove the Cabin and Ticket columns.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=['Cabin', 'Ticket'], errors='ignore')

In [None]:
# Drop the rows (axis=0) whose Embarked value is missing, then preview the result.
df = df.dropna(subset="Embarked",axis=0)
df.head()

In [None]:
# Distinct values remaining in the Embarked column.
df['Embarked'].unique()

In [None]:
# Number of missing values per column.
df.isnull().sum()

# **Filling the missing values**


> We could take the average with respect to any of the factors, but we'll select Pclass because it is divided into 3 categories, so we'll get a more accurate answer.




In [None]:
# Box plot of Age for each Pclass value, rendered at 700x400.
fig = px.box(x='Pclass',y='Age',data_frame = df)
fig.update_layout(width=700,height=400)
fig.show()

In [None]:

def English(str):
    """Return the title portion of a passenger name.

    The name is split on ", " and the second piece is taken; the title is
    whatever precedes the first ". " in that piece (the whole piece if no
    ". " is present). Assumes names look like "Surname, Title. Given names".

    Raises:
        IndexError: if the name contains no ", " separator.
    """
    after_surname = str.split(", ")[1]
    title, _, _ = after_surname.partition(". ")
    return title

# Derive a Title column by applying English() to every passenger name.
df['Title'] = df['Name'].apply(English)
df.head()


In [None]:
def impute_age(df):
  """Return the row's Age, or a class-based fallback when Age is missing.

  Args:
    df: a single row (anything indexable by 'Age' and 'Pclass'), e.g. the
      row passed by ``DataFrame.apply(..., axis=1)``.

  Returns:
    The existing Age when it is not null; otherwise 37 for Pclass 1,
    29 for Pclass 2 and 24 for any other Pclass.
  """
  age, pclass = df['Age'], df['Pclass']

  if not pd.isnull(age):
    return age

  # Fallback ages per passenger class; every other class falls back to 24.
  fallback_age = {1: 37, 2: 29}
  return fallback_age.get(pclass, 24)

In [None]:
# Run impute_age on every row (axis=1) and overwrite the Age column with the result.
df['Age'] = df.apply(impute_age,axis=1)
df.head()

# **Visualizing the Data**

Cabin and Ticket are not very important for estimating the probability of survival, so we can simply drop them.

Now we have to fill the missing values in age

In [None]:
# Passenger counts per Survived value, split by Sex.
sns.countplot(x='Survived',hue='Sex',data=df,palette='coolwarm')

In [None]:
# Passenger counts per Survived value, split by Pclass.
sns.countplot(x='Survived',hue='Pclass',data=df,palette='coolwarm')

In [None]:
# Histogram of Age with 40 bins; the trailing ';' suppresses the text repr.
df['Age'].hist(bins=40)
plt.title('Age Distribution');

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# numeric_only=True is required because df still holds string columns
# (e.g. Name, Sex, Embarked, Title); without it pandas >= 2.0 raises
# instead of silently skipping them.
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.title('Corelation Matrix');

In [None]:
# Box plot of Age for each Sex.
sns.boxplot(y='Age',x='Sex',data=df,palette='rainbow')


**No. of people who survived in the ship w.r.t their Fare**

In [None]:
# Fares paid by passengers with Survived == 1 (missing fares removed),
# shown as a 20-bin histogram without a KDE curve.
is_survivor = df['Survived'] == 1
survived_fares = df.loc[is_survivor, 'Fare'].dropna()
g = sns.displot(survived_fares, bins=20, color='darkgreen', kde=False)


Now group by

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

# **Question: How many men who were in Pclass 3 died on the ship?**

In [None]:
# Count the (non-null) names of male passengers in Pclass 3 with Survived == 0.
male_third_class_died = (
    (df['Sex'] == 'male')
    & (df['Pclass'] == 3)
    & (df['Survived'] == 0)
)
df.loc[male_third_class_died, 'Name'].count()

In [None]:
# Print a summary of the Sex column.
df['Sex'].info()

# **Feature Engineering**

In [None]:
# Work on an independent copy for feature engineering. A plain `df2 = df`
# only creates a second name for the same object, so columns added to df2
# would also appear in df.
df2 = df.copy()

We create a new column to store the total number of people in each passenger's family.

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
# Both operands are read from df2 so the feature never mixes two frames.
df2['familySize'] = df2['SibSp'] + df2['Parch'] + 1

In [None]:
# Preview df2 with the new familySize column.
df2.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Embarked: build one indicator column per distinct value,
# append those columns to df2 and remove the original Embarked column.
encoded_columns = pd.get_dummies(df2['Embarked'])
df2 = pd.concat([df2.drop(columns='Embarked'), encoded_columns], axis=1)


In [None]:
# One-hot encode Sex: append one indicator column per distinct value to df2
# and remove the original Sex column.
encoded_columns1 = pd.get_dummies(df2['Sex'])
df2 = pd.concat([df2.drop(columns='Sex'), encoded_columns1], axis=1)

In [None]:
# Remove the columns the ML model should not see. 'female' is dropped because
# it is the complement of the remaining Sex indicator column.
# NOTE(review): any identifier column still in df2 (e.g. a passenger id) would
# be fed to the models as a feature - confirm whether that is intended.
unused_columns = ['SibSp', 'Parch', 'Name', 'female', 'Title']
df2 = df2.drop(columns=unused_columns)

In [None]:
# Preview the feature frame after dropping the unused columns.
df2.head()

In [None]:
# Drop rows with a missing value in any of the Embarked indicator columns.
# dropna returns a new frame, so the result must be assigned back; the
# original call discarded it and left df2 unchanged.
df2 = df2.dropna(subset=['S', 'Q', 'C'], axis=0)
df2.head()

In [None]:
# Number of missing values per column of df.
# NOTE(review): the frame used for modelling is df2 - confirm df is the one meant here.
df.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Target is Survived; every other column of df2 is a feature.
# 33% of the rows are held out for testing, with a fixed seed for repeatability.
y = df2['Survived']
X = df2.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Scaling the Data

> Here we're using `StandardScaler` because we don't need to preserve the units.
We just want to keep the relation between two features as accurate as possible.



In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Applying Machine Learning Models



## Logistic Regression



In [None]:
# Search space for the logistic-regression grid search.
# It is written as a list of grids so that each solver is only paired with
# penalties it supports; a single flat grid would also generate combinations
# such as lbfgs + l1 or any solver + elasticnet, which fail to fit and only
# produce warnings and NaN scores.
# The stray `from tables.file import parameters` import was unused and has
# been removed (it made the cell depend on PyTables being installed).
param_grid = [
    {
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
        'penalty': ['l2', None],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 300],
    },
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 300],
    },
]

In [None]:
from sklearn.linear_model import LogisticRegression

# 5-fold grid search over param_grid, selecting by accuracy, fitted on the
# scaled training data.
model = LogisticRegression()
cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
cv.fit(X_train_s, y_train)

In [None]:
# Accuracy of the best logistic-regression model on the scaled test set.
y_pred = cv.predict(X_test_s)
logistic_score = accuracy_score(y_test, y_pred)
logistic_score

In [None]:
# Parameter combination selected by the grid search.
cv.best_params_

## K Nearest neighbours

> First we'll find the best value for k through a line plot



In [None]:
# Estimate the misclassification rate for k = 1..59.
# Each k is scored with 5-fold cross-validation on the training data, so the
# held-out test set is not used to choose k and stays a fair final check.
# error_rate[j] corresponds to k = j + 1.
error_rate = []

# Will take some time
for i in range(1, 60):
    knn = KNeighborsClassifier(n_neighbors=i)
    fold_accuracy = cross_val_score(knn, X_train_s, y_train, cv=5, scoring='accuracy')
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# Line plot of the error rate collected for each k in 1..59
# (blue dashed line, 800x600).
fig = px.line(x=range(1, 60), y=error_rate,
              title='Error Rate vs. K Value',
              labels={'x': 'K', 'y': 'Error Rate'})
fig.update_traces(line=dict(color='blue', dash='dash'))
fig.update_layout(width=800, height=600)
fig.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit k-NN with k=31 on the scaled training data, then score it on the
# scaled test data.
knnclf = KNeighborsClassifier(n_neighbors=31).fit(X_train_s, y_train)
y_pred = knnclf.predict(X_test_s)

knn_score = accuracy_score(y_test, y_pred)
knn_score



 ## Decision Trees and Random Forests





In [None]:
# Search space for the tree grid search: split criterion, split strategy,
# maximum depth (1-5) and the number of features considered per split.
param_grid={
 'criterion':['gini','entropy','log_loss'],
  'splitter':['best','random'],
  'max_depth':[1,2,3,4,5],
  'max_features':['sqrt', 'log2']
}

In [None]:
from sklearn.tree import DecisionTreeClassifier

# 5-fold grid search over param_grid for a decision tree, selecting by
# accuracy. The unscaled features are used for both fitting and prediction.
model = DecisionTreeClassifier()
cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [None]:
# Accuracy of the best decision tree on the test set.
Decsion_Tree_score = accuracy_score(y_test, y_pred)
Decsion_Tree_score

In [None]:
# Parameter combination selected by the grid search for the decision tree
cv.best_params_

In [None]:
# Estimate the misclassification rate of a random forest for several forest
# sizes. Each size is scored with 5-fold cross-validation on the training
# data, so the test set is not used for tuning, and random_state is fixed so
# the curve is reproducible between runs.
error_rate = []
value = [30, 50, 100, 150, 200, 250, 300, 350, 400, 450]

# Will take some time
for i in value:
    rfc = RandomForestClassifier(n_estimators=i, random_state=42)
    fold_accuracy = cross_val_score(
        rfc, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    )
    error_rate.append(1 - fold_accuracy.mean())

# The x axis is the number of trees, so it is labelled n_estimators (not K).
fig = px.line(x=value, y=error_rate,
              title='Error Rate vs. n value',
              labels={'x': 'n_estimators', 'y': 'Error Rate'})
fig.update_traces(line=dict(color='blue', dash='dash'))
fig.update_layout(width=800, height=600)
fig.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the forest. It is defined here rather than reusing the
# decision-tree grid, because that grid contains 'splitter', which
# RandomForestClassifier does not accept.
rf_param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2'],
}

# 100 trees; larger forests mainly add training time.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Tune the random forest itself. The previous code passed `model` (the
# decision tree from the earlier cell) to GridSearchCV, so rf_score was
# really a second decision-tree score.
cv = GridSearchCV(rfc, rf_param_grid, cv=5, scoring='accuracy')
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

rf_score = accuracy_score(y_test, y_pred)
rf_score



> ## Support Vector Machine



In [None]:
# Search space for the SVC grid search: regularisation strength C,
# kernel coefficient gamma and kernel type.
param_grid = {'C': [0.1,1, 10, 100, 1000],
              'gamma': ['scale', 'auto'] ,
              'kernel': ['rbf','sigmoid','poly','linear']}

In [None]:
from sklearn.svm import SVC

# Run the 5-fold grid search over param_grid on the scaled training data.
# The previous code built `cv` but then fitted a default SVC directly, so the
# search never ran and param_grid had no effect.
cv = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
cv.fit(X_train_s, y_train)

# Keep `model` pointing at the fitted best estimator so that later cells
# calling model.predict(...) use the tuned SVC.
model = cv.best_estimator_
model

In [None]:
# Accuracy of the fitted SVC on the scaled test set.
y_pred = model.predict(X_test_s)
svc_score = accuracy_score(y_test, y_pred)
svc_score