# First refer to "Data_Preproccessing" 

## Importing Modules

# First refer to "Data_Preproccessing" 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the data; "data.csv" is a relative path, so it is resolved against the
# notebook's current working directory.
dataset = pd.read_csv("data.csv")

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,WikiId,Name_wiki,Age_wiki,Hometown,Boarded,Destination,Lifeboat,Body,Class
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,691.0,"Braund, Mr. Owen Harris",22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",,,3.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,90.0,"Cumings, Mrs. Florence Briggs (née Thayer)",35.0,"New York, New York, US",Cherbourg,"New York, New York, US",4,,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,865.0,"Heikkinen, Miss Laina",26.0,"Jyväskylä, Finland",Southampton,New York City,14?,,3.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,127.0,"Futrelle, Mrs. Lily May (née Peel)",35.0,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",D,,1.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,627.0,"Allen, Mr. William Henry",35.0,"Birmingham, West Midlands, England",Southampton,New York City,,,3.0


In [4]:
# List every column name; the wide preview elides the middle columns with "...".
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'WikiId', 'Name_wiki',
       'Age_wiki', 'Hometown', 'Boarded', 'Destination', 'Lifeboat', 'Body',
       'Class'],
      dtype='object')

### Removing unnecessary Columns from the dataset

`DataFrame.drop()` is used to remove columns (or rows) from a dataset.

In [5]:
# Columns that are not used as model inputs. Only the target (Survived) and the
# features Pclass, Sex, Age, SibSp, Parch and Embarked remain afterwards.
columns_to_drop = [
    'PassengerId', 'WikiId', 'Name', 'Name_wiki', 'Ticket', 'Cabin',
    'Age_wiki', 'Hometown', 'Boarded', 'Destination', 'Lifeboat', 'Body',
    'Class', 'Fare',
]
dataset = dataset.drop(columns=columns_to_drop)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.0,3,male,22.0,1,0,S
1,1.0,1,female,38.0,1,0,C
2,1.0,3,female,26.0,0,0,S
3,1.0,1,female,35.0,1,0,S
4,0.0,3,male,35.0,0,0,S


### Taking care of Missing data

In [6]:
# Count the missing values in each column.
dataset.isnull().sum()

Survived    418
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Embarked      2
dtype: int64

##### percentage

In [7]:
# Share of missing values per column, as a percentage of all rows.
missing_counts = dataset.isnull().sum()
row_count = len(dataset)
for column_name, missing in missing_counts.items():
    print("{}:  {}%".format(column_name, missing / row_count * 100))

Survived:  31.932773109243694%
Pclass:  0.0%
Sex:  0.0%
Age:  20.091673032849503%
SibSp:  0.0%
Parch:  0.0%
Embarked:  0.15278838808250572%


Here, I will use the `fillna()` method to fill the missing values in the `Age` column with the column mean (truncated to a whole number).

In [8]:
# Impute missing ages with the column mean, truncated to a whole number.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns on recent pandas 2.x
# releases and leaves `dataset` unchanged once Copy-on-Write is enabled.
age_fill_value = int(dataset["Age"].mean())
dataset["Age"] = dataset["Age"].fillna(age_fill_value)

##### __NOTE:__ We will drop every row that has a NaN in the Survived column, because Survived is our dependent variable and a model cannot be trained on missing target values. The two rows with a missing Embarked value are dropped as well.

In [9]:
# Discard rows that lack the target (Survived) or the Embarked feature, then
# confirm that no missing values remain.
required_columns = ["Survived", "Embarked"]
dataset = dataset.dropna(subset=required_columns)
dataset.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

### Handling Categorical Values in Dataset

#### __NOTE:__ Here, I have used get_dummies() method in order to encode categorical values.

In [10]:
# One-hot encode the two categorical features. drop_first=True removes the
# first level of each ('C' for Embarked, 'female' for Sex), leaving the
# indicator columns Q, S and male.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would no longer match the printed output of this cell.
N_Embarked = pd.get_dummies(dataset["Embarked"], drop_first=True, dtype=int)
N_Sex = pd.get_dummies(dataset["Sex"], drop_first=True, dtype=int)

dataset = pd.concat([dataset, N_Embarked, N_Sex], axis=1)
print(dataset.head())

# The original Sex and Embarked columns are redundant now that they are encoded.
# `columns=` already names the axis, so no separate axis argument is needed.
dataset = dataset.drop(columns=["Sex", "Embarked"])
print(dataset.head())

   Survived  Pclass     Sex   Age  SibSp  Parch Embarked  Q  S  male
0       0.0       3    male  22.0      1      0        S  0  1     1
1       1.0       1  female  38.0      1      0        C  0  0     0
2       1.0       3  female  26.0      0      0        S  0  1     0
3       1.0       1  female  35.0      1      0        S  0  1     0
4       0.0       3    male  35.0      0      0        S  0  1     1
   Survived  Pclass   Age  SibSp  Parch  Q  S  male
0       0.0       3  22.0      1      0  0  1     1
1       1.0       1  38.0      1      0  0  0     0
2       1.0       3  26.0      0      0  0  1     0
3       1.0       1  35.0      1      0  0  1     0
4       0.0       3  35.0      0      0  0  1     1


### Splitting the dataset into Training set and Test set.

Before splitting the dataset into a training set and a test set, make sure to separate the independent variables from the dependent variable.

In [11]:
# y is the dependent variable (the Survived label); X holds every other column.
y = dataset["Survived"]
X = dataset.drop("Survived", axis="columns")

##### Train test splitting

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise the features that sit at positions 1:3 of X, i.e. Age and SibSp.
# They are selected by name so the choice does not depend on column order, and
# assigned as whole columns so the integer SibSp column can hold the scaled
# floats (writing floats into an integer column through .iloc is deprecated in
# recent pandas releases).
scaled_columns = ["Age", "SibSp"]

# Work on explicit copies so the assignments never target a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()

sc = StandardScaler()
# Fit on the training split only; the test split reuses the same statistics.
X_train[scaled_columns] = sc.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = sc.transform(X_test[scaled_columns])

In [14]:
# Inspect the training features after scaling.
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Q,S,male
708,1,-0.588394,-0.474516,0,0,1,0
240,3,-0.054771,0.38178,0,0,0,0
382,3,0.173925,-0.474516,0,0,1,1
792,3,-0.054771,6.375852,2,0,1,0
683,3,-1.19825,3.806964,2,0,1,1


## Regression Models

1. Simple Linear Regression
2. Multiple Linear Regression
3. Polynomial Regression
4. Support Vector Regression (SVR)
5. Decision Tree Regression
6. Random Forest Regression

### 1. Simple Linear Regression