# Imports

In [1]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import sklearn
import lazypredict
from ydata_profiling import ProfileReport

ModuleNotFoundError: No module named 'lazypredict'

In [131]:
# Importing dataset

from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_csv("/content/drive/MyDrive/Datasets/titanic/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Datasets/titanic/test.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [132]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [204]:
train_df.shape

(891, 12)

In [209]:
test_df.shape

(418, 11)

In [133]:
features = test_df.columns

In [134]:
df = pd.concat([train_df[features], test_df])

# Exploratory data analysis

## Automated EDA using ydata

In [135]:
report = ProfileReport(df, title = "report")

In [136]:
report

Output hidden; open in https://colab.research.google.com to view.

In [137]:
train_report = ProfileReport(train_df, title = "train_report")
train_report

Output hidden; open in https://colab.research.google.com to view.

## Exploratory data analysis


In [138]:
# Data points and features
print(f"The triaing data has {train_df.shape[1]} features and {train_df.shape[0]} rows")
print(f"The testing data has {test_df.shape[1]} features and {test_df.shape[0]} rows")

The triaing data has 12 features and 891 rows
The testing data has 11 features and 418 rows


In [139]:
# Duplicates

print(f"The training data has {train_df.duplicated().sum()} duplicate values")
print(f"The testing data has {test_df.duplicated().sum()} duplicate values")

The training data has 0 duplicate values
The testing data has 0 duplicate values


## Preprocessing

In [140]:
# Missing values

def missing_values(data):
  missed = data.isnull().sum()
  missed = missed[missed>0].reset_index()
  missed.columns = ["Features","count"]
  print(missed.to_string(index=False))

print("Missing values in training data")
missing_values(train_df)

print("\n")
print("Missing values in testing data")
missing_values(test_df)

print("\n")
print("Missing values in combined data")
missing_values(df)


Missing values in training data
Features  count
     Age    177
   Cabin    687
Embarked      2


Missing values in testing data
Features  count
     Age     86
    Fare      1
   Cabin    327


Missing values in combined data
Features  count
     Age    263
    Fare      1
   Cabin   1014
Embarked      2


### Filling missing values

In [141]:
# Im considering the missing age would be age Zero

df.fillna({"Age":0}, inplace =True)

In [142]:
# missing value in "Fare"

# Pclass of the missing datapoint
class_fare = df[df["Fare"].isnull()]["Pclass"].iloc[0]

# filtering datapoints with same pclass
class_3 = df[df["Pclass"] == class_fare]["Fare"]

# Average of the fare
avg = class_3.sum()/len(class_3)

df.fillna({"Fare":avg}, inplace = True)


In [143]:
# Embarked

print(df[df["Embarked"].isnull()])

temp = df[(df["Ticket"].str[:2]=="11") & (df["Cabin"].str[:1]=="B")]

print(temp["Embarked"].value_counts())

print(temp["Cabin"].value_counts())


# Filling "Embarked" with most probable value "S"


df.fillna({"Embarked": "S"}, inplace = True)

     PassengerId  Pclass                                       Name     Sex  \
61            62       1                        Icard, Miss. Amelie  female   
829          830       1  Stone, Mrs. George Nelson (Martha Evelyn)  female   

      Age  SibSp  Parch  Ticket  Fare Cabin Embarked  
61   38.0      0      0  113572  80.0   B28      NaN  
829  62.0      0      0  113572  80.0   B28      NaN  
Embarked
S    15
C     7
Name: count, dtype: int64
Cabin
B96 B98        4
B77            2
B49            2
B18            2
B28            2
B30            1
B102           1
B26            1
B11            1
B36            1
B42            1
B37            1
B38            1
B79            1
B94            1
B19            1
B52 B54 B56    1
Name: count, dtype: int64


In [144]:
# Cabin

df.drop(["Cabin"], axis = 1, inplace = True)


In [145]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0
Embarked,0


### Feature Engineering

In [191]:
# def analyse(row):
#   temp = row.split(" ")

#   if len(temp)>2:
#     print(row)

#   try:
#     temp1 = int(float(temp[-1]))
#   except:
#     print(f"{row}=={temp[-1]}")


In [192]:
# for row in df["Ticket"]:
#   analyse(row)

STON/O 2. 3101294
STON/O 2. 3101280
LINE==LINE
STON/O 2. 3101275
LINE==LINE
LINE==LINE
STON/O 2. 3101293
STON/O 2. 3101289
STON/O 2. 3101269
STON/O 2. 3101274
SC/AH Basle 541
STON/O 2. 3101286
STON/O 2. 3101273
LINE==LINE
STON/O 2. 3101292
STON/O 2. 3101285
STON/O 2. 3101288
STON/O 2. 3101291
STON/O 2. 3101268
A. 2. 39186


In [196]:
# Feature "Ticket"

def separator(ticket):
  """
  ticket: Ticket string

  output: [prefix, number] Three ways are followed depending length of the ticket.split(" ")

  """
  temp = ticket.split(" ")
  # Length = 1
  if len(temp)== 1:
    try:
      temp_int = int(float(temp[0]))

      # Embedding own prefix -->G + "starting number" + "Number of digits"
      str_temp = str(temp_int)

      prefix = f"G{str_temp[0]}{len(str_temp)}"
      temp = [prefix, temp_int]
    except:
      temp = [temp[0], 11111]
    return temp
  # Length = 2
  elif len(temp) == 2:
    return [temp[0], int(float(temp[1]))]

  # Length > 2
  elif len(temp) > 2:
    prefix = f"{temp[0]} {temp[1]}"
    number = int(float(temp[-1]))
    return [prefix, number]


In [197]:
df["Ticket_prefix"] = df["Ticket"].apply(lambda x: separator(x)[0])
df["Ticket_number"] = df["Ticket"].apply(lambda x: separator(x)[-1])

In [200]:
# Dropping the "Ticket"

df.drop(["Ticket"], axis =1, inplace = True)

### Encoding and Normalisation

In [146]:
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [150]:
# Feature PassengerID dropping

df.drop(["PassengerId"], axis=1, inplace = True)

In [152]:
# Pclass is already encoded and normalised



In [154]:
# Feature "Name" Dropping


df.drop(["Name"], axis =1, inplace = True)

In [158]:
# Feature "sex"

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])


array([1, 0])

In [160]:
# Feature "Age" Normalisation

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df["Age"] = scaler.fit_transform(df[["Age"]])

In [162]:
# Feature "SibSp"

df["SibSp"].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [163]:
# Feature "Parch"

df["Parch"].unique()

array([0, 1, 2, 5, 3, 4, 6, 9])

In [None]:
# Feature "Ticket_prefix" Encoding

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Ticket_prefix"] = le.fit_transform(df["Ticket_prefix"])

In [None]:
# Feature "Ticket_number" Normalisation

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df["Ticket_number"] = scaler.fit_transform(df[["Ticket_number"]])

In [206]:
# Feature "Fare" Normalisation

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df["Fare"] = scaler.fit_transform(df[["Fare"]])

In [208]:
# Feature "Embarked" Encoding

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Embarked"] = le.fit_transform(df["Embarked"])


In [213]:
# splitting data into train and test

## we havenot removed any rows so

train_set = df[:891]
test_set = df[891:]

print(f"Postprocessed train shape:{train_set.shape}, testing shape: {test_set.shape}")
print(f"Preprocessed trained shape:{train_df.shape}, testing shape: {test_df.shape}")

Postprocessed train shape:(891, 9), testing shape: (418, 9)
Preprocessed trained shape:(891, 12), testing shape: (418, 11)


In [214]:
train_set.iloc[40:50]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_prefix,Ticket_number
40,3,0,0.5,1,0,0.018494,2,35,0.002433
41,2,0,0.3375,1,0,0.040989,2,20,0.003762
42,3,1,0.0,0,0,0.015412,0,27,0.112614
43,2,0,0.0375,1,2,0.081157,0,56,0.000684
44,3,0,0.2375,0,0,0.015379,1,27,0.106715
45,3,1,0.0,0,0,0.015713,2,43,0.007598
46,3,1,0.0,1,0,0.030254,1,27,0.119423
47,3,0,0.0,0,0,0.015127,1,20,0.004614
48,3,1,0.0,2,0,0.042315,0,22,0.000858
49,3,0,0.225,1,0,0.034743,2,27,0.112609


In [215]:
train_df.iloc[40:50]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
40,41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1,0,7546,9.475,,S
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C
44,45,1,3,"Devaney, Miss. Margaret Delia",female,19.0,0,0,330958,7.8792,,Q
45,46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S
46,47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
47,48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S


In [217]:
train_set["Survived"] = train_df["Survived"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set["Survived"] = train_df["Survived"]


In [218]:
train_set.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_prefix,Ticket_number,Survived
0,3,1,0.275,1,0,0.014151,2,5,0.006826,0
1,1,0,0.475,1,0,0.139136,0,41,0.005674,1
2,3,0,0.325,0,0,0.015469,2,63,0.999989,1
3,1,0,0.4375,1,0,0.103644,2,21,0.036694,1
4,3,1,0.4375,0,0,0.015713,2,27,0.120416,0


# Classification

## train_test_split

In [228]:
x_train = train_set.drop(["Survived"], axis =1)
y_train = train_set["Survived"]
x_test = test_set

In [229]:
classifiers = lazypredict.Supervised.CLASSIFIERS

In [230]:
classifiers

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
 ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
 ('FixedThresholdClassifier',
  sklearn.model_selection._classification_threshold.FixedThresholdClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('KNeighborsClassifier',
  sklearn.neighbors._classification.KNeighborsClassifier),
 ('LabelPropagation',
  sklearn.semi_supervised._label_propagation.LabelPropagation),
 ('LabelSpreading', sklearn.semi_supervised._label_propagation.LabelSprea

In [235]:
for classifier in classifiers[:1]:
  temp = classifier[1]().fit(x_train, y_train)
  y_predict = temp.predict(test_set)
  predicitons = {"PassengerId":test_df["PassengerId"], "Survived":y_predict}
  predictions = pd.DataFrame(predicitons)
  predictions.to_csv(f"/content/drive/MyDrive/Datasets/titanic/{classifier[0]}_predictions.csv", index = False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


418

## Visualising Results

In [None]:
# Accuracy

line = px.line(data_frame = models, y = ["Accuracy"], markers=True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "Accuracy")
line.update_traces(line_color="red")
line.update_layout(showlegend=True,
                   title = {
                       "text":"Accuracu vs model",
                       "y":0.95,
                       "x":0.5,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
# ROC and F1 Score

line = px.line(data_frame= models, y = ["ROC AUC", "F1 Score"], markers = True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "ROC AUC Score")
line.update_layout(showlegend = True,
                   title= {
                       "text":"ROC AOC and F1 Score vs Model",
                       "x" :0.50,
                       "y":0.95,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
# Training Time

line = px.line(data_frame = models, y = ["Time Taken"], markers = True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "Time Taken")
line.update_traces(line_color = "green")
line.update_layout(showlegend = True,
                   title = {
                       "text":"Training Time vs Model",
                       "x":0.50,
                       "y":0.95,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
models = pd.DataFrame(models).to_csv("/content/drive/MyDrive/Datasets/expedia-travel-dataset/models.csv")

In [None]:
predictions = pd.DataFrame(predictions).to_csv("/content/drive/MyDrive/Datasets/expedia-travel-dataset/predictions.csv")