## Importing the Liberaries

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the Dataset

In [27]:
dataset = pd.read_csv('train.csv')

# Data Pre-Processing

## getting basic info about the dataset

In [28]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Name         891 non-null    object 
 2   Survived     891 non-null    int64  
 3   Pclass       891 non-null    int64  
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
dataset.head()

Unnamed: 0,PassengerId,Name,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,"Braund, Mr. Owen Harris",0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,"Allen, Mr. William Henry",0,3,male,35.0,0,0,373450,8.05,,S


## Checking how many unique values are there in each column

In [30]:
dataset.nunique()

PassengerId    891
Name           891
Survived         2
Pclass           3
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

## Taking Care of the Missing Values

In [31]:
## checking how many missing values are in each column

In [32]:
dataset.isna().sum()

PassengerId      0
Name             0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [33]:
## Using SimpleImputer to replace missing values in age column with the mean value.

In [34]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(dataset[['Age']])
dataset['Age'] = imputer.transform(dataset[['Age']]).ravel()

In [35]:
dataset.isna().sum()

PassengerId      0
Name             0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Dropping Some Features

In [36]:
dataset.drop(['Cabin' , 'PassengerId' , 'Name' , 'Ticket'] , axis = 1 , inplace = True)

In [37]:
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## Dropping the rows which still have some null values in them

In [38]:
dataset = dataset.dropna()

In [39]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


## Initializing the matrix of features X and the Predicted Values y

In [40]:
X = dataset.iloc[:,1:].values
y = dataset.iloc[:,0].values

# Encoding non-numeric columns

## Encoding the column 'Sex' with LabelEncoder

In [41]:
print(X)

[[3 'male' 22.0 ... 0 7.25 'S']
 [1 'female' 38.0 ... 0 71.2833 'C']
 [3 'female' 26.0 ... 0 7.925 'S']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'S']
 [1 'male' 26.0 ... 0 30.0 'C']
 [3 'male' 32.0 ... 0 7.75 'Q']]


In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,1] = le.fit_transform(X[:,1])

## Encoding the column 'Embarked' with One Hot Encoder

In [43]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[6])], remainder ='passthrough')
X = np.array(ct.fit_transform(X))

## Splitting the dataset into Training and test Data

In [44]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train, y_test = train_test_split(X , y , test_size = 0.2, random_state = 1)

# 1. Multiple Linear Regression

## Training the model

In [45]:
from sklearn.linear_model import LinearRegression
mul_lin_reg = LinearRegression()
mul_lin_reg.fit(X_train, y_train)

LinearRegression()

## Predicting Values

In [46]:
y_pred = mul_lin_reg.predict(X_test)

## Evaluating Performance of the model

In [47]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.49546891501818247

# 2. Polynomial Regression

## Training the model

In [48]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg_fe = PolynomialFeatures(degree = 10)
X_poly = poly_reg_fe.fit_transform(X_train)
poly_reg = LinearRegression()
poly_reg.fit(X_poly, y_train)

LinearRegression()

## Predicting Values

In [49]:
y_pred = poly_reg.predict(poly_reg_fe.transform(X_test))

## Evaluating Performance of the model

In [50]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-2.863934058076647e+20

# 3. Support Vector Regression

## Feature Scaling

## Training the model

## Predicting Values

## Evaluating Performance of the model

# 4. Decesion Tree Regression

## Training the model

In [51]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

## Predicting Values

In [52]:
y_pred = regressor.predict(X_test)

## Evaluating Performance of the model

In [53]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.15175563158509764

# 5. Random Forest Regression

## Training the model

In [54]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

## Predicting Values

In [55]:
y_pred = regressor.predict(X_test)

## Evaluating Performance of the model

In [56]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.45513757059316673