# Import Dataset




In [43]:
import pandas as pd
import numpy as np

In [93]:
titanic = pd.read_csv('/content/Titanic-Dataset.csv')

# Look at Keys/Column names of the dataframe

In [94]:
titanic.keys()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [96]:
#506 instances, meaning 506 records
#the fourteenth attribute is stored in "target"
#describes the different features of the thirteen attributes
titanic["Survived"]

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

# View Target Data

In [97]:
#unlike classifier models, the answers are given in numerical values
titanic['target'] = titanic['Survived']
titanic['target']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: target, Length: 891, dtype: int64

In [98]:
titanic.dropna(inplace=True)

# Load Data in Variables X, Y

In [99]:
X = titanic[['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

In [100]:
X

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [101]:
y = titanic['target']
y

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: target, Length: 183, dtype: int64

In [102]:
# Next we'll drop 'useless' columns (non numerical columns and ones that can't be converted into dummy variable useful forms)
X.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,2,1,1,female,38.0,1,0,71.2833,C
3,4,1,1,female,35.0,1,0,53.1,S
6,7,0,1,male,54.0,0,0,51.8625,S
10,11,1,3,female,4.0,1,1,16.7,S
11,12,1,1,female,58.0,0,0,26.55,S


In [103]:
# Now we'll convert the port embarked and sex into numerical-categorical variables
embark = pd.get_dummies(X['Embarked'], drop_first=True)
sex = pd.get_dummies(X['Sex'], drop_first=True)

In [104]:
embark.head()

Unnamed: 0,Q,S
1,0,0
3,0,1
6,0,1
10,0,1
11,0,1


In [105]:
sex.head()

Unnamed: 0,male
1,0
3,0
6,1
10,0
11,0


In [106]:
# Drop Embarked and Sex columns as we will add our dummy-versions shortly
X.drop(['Embarked', 'Sex'], axis=1, inplace=True)
X = pd.concat([X, sex, embark], axis=1)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(['Embarked', 'Sex'], axis=1, inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
1,2,1,1,38.0,1,0,71.2833,0,0,0
3,4,1,1,35.0,1,0,53.1,0,0,1
6,7,0,1,54.0,0,0,51.8625,1,0,1
10,11,1,3,4.0,1,1,16.7,0,0,1
11,12,1,1,58.0,0,0,26.55,0,0,1


# Split Data into Training and Testing Data

In [107]:
#33 percent of our data is for testing
#67 percent of our data is for training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.33,
                                                    random_state = 42)

# Check Percentage Distributions

In [108]:
#67 percent of 506 is 339
X_train.shape

(122, 10)

In [109]:
#33 percent of 506 is 167
X_test.shape

(61, 10)

In [110]:
y_train.shape

(122,)

In [111]:
y_test.shape

(61,)

# Import Linear Regression Model from Sci-kit Learn

In [112]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression() # create an instance of a Linear Regression model
lr.fit(X_train, y_train) # use training data and training answers to train the model using .fit

# Make Predictions

In [113]:
#call predict method on test data
y_test_predict_vals = lr.predict(X_test)
y_test_predict_vals #these are the predicted values.

array([-2.18552499e-15,  1.93781574e-14,  1.00000000e+00, -8.69239483e-16,
        1.00000000e+00,  5.05735554e-15, -1.58103470e-16,  1.00000000e+00,
        1.00000000e+00,  3.37770190e-15,  3.63760935e-15,  1.00000000e+00,
        2.25826610e-15,  1.00000000e+00, -2.61747391e-15,  1.00000000e+00,
        1.00000000e+00,  1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        1.00000000e+00,  1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        2.36253571e-15,  1.00000000e+00,  1.00000000e+00,  2.46652166e-15,
        1.00000000e+00,  1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        1.20089925e-15,  1.00000000e+00, -5.45468534e-16,  2.43933099e-15,
        1.00000000e+00,  2.31329554e-15, -3.14037400e-15,  1.00000000e+00,
        2.48631637e-14,  1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        1.00000000e+00,  2.49090840e-15,  1.00000000e+00,  1.00000000e+00,
        1.00000000e+00,  1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
        3.23741551e-15,  

In [114]:
y_test

118    0
251    0
742    1
544    0
712    1
      ..
741    0
435    1
789    0
151    1
772    0
Name: target, Length: 61, dtype: int64

# Implement R-Square Score

In [115]:
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_test, y_test_predict_vals)

5.343660607150629e-29

In [116]:
#difference between answers related to difference between the mean of actual answers
#the closer score is to 1, the better
r2_score(y_test, y_test_predict_vals)

1.0

Yay!! Unfortunately dataset had a lot of rows with NA entries, in the future values could be inputed/interpolated. Also, in hindsight, I should have done dummy-variable stuff BEFORE dropping any observations with NA values.