### Importing Libraries & Loading in Data

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [101]:
train.isna().sum() # finding missing values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Handeling Missing Values

In [102]:
# "M" values indicate "Missing"
train['Embarked'] = train['Embarked'].fillna('M') 
test['Embarked'] = test['Embarked'].fillna('M') 

# Assuming missing values were of averagely aged individuals
train['Age'] = train['Age'].fillna(29.699)
test['Age'] = test['Age'].fillna(29.699)

In [103]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
e = LabelEncoder()

# encode Sex into numerical values: 1=male 0=female
train['sex_encoded'] = e.fit_transform(train['Sex'])
test['sex_encoded'] = e.fit_transform(test['Sex'])

# encode embarked
train['embark_ec'] = e.fit_transform(train['Embarked'])
test['embark_ec'] = e.fit_transform(test['Embarked'])

### Defining Independent and Dependent Variables

In [105]:
X = np.array(train[['Pclass', 'Age', 'sex_encoded', 'embark_ec']])
y = np.array(train['Survived'])

test_X = test[['Pclass', 'Age', 'sex_encoded', 'embark_ec']]

from sklearn import linear_model # fitting the model
reg = linear_model.LinearRegression()
reg.fit(X,y)

LinearRegression()

### Creating predictions

In [106]:
test_y = reg.predict(test_X)
print(test_y)

[ 0.10133476  0.50461309  0.14255555  0.11026681  0.62853559  0.17470652
  0.61712509  0.29275926  0.70485259  0.14000822  0.09688814  0.37115681
  0.97864979  0.10935396  0.85968418  0.85264673  0.27639186  0.19649761
  0.60375109  0.57101628  0.3830341   0.19949102  0.94544351  0.5515687
  0.91121667 -0.00374189  1.04009608  0.18906226  0.39594131  0.15337753
  0.17379366  0.30267306  0.57400969  0.59037242  0.5069566   0.20888986
  0.59037242  0.63349249  0.12018061  0.09688814  0.10727341  0.45195923
  0.04087021  0.76641594  0.86959798  0.12018061  0.4326031   0.12513284
  0.85173387  0.55913899  0.48020861  0.34429175  0.81598494  0.95386528
  0.33091308  0.22277881  0.07061161  0.12018061  0.09688814  0.97069948
  0.15983582  0.26301786  0.15487892  0.65678029  0.5912239   0.76790796
  0.67660789  0.36619991  0.5019997   0.79524448  0.64686649  0.14000822
  0.59383729  0.51439195  0.97565638  0.49456435  0.09688814  0.82002898
  0.27293166  0.64686649  0.27085111  0.2670619   0.

In [108]:
predictions = {}
for index, pred in enumerate(test_y):
    key = test['PassengerId'][index]
    if pred < .5:
        value = 0
    else:
        value = 1
    predictions[key] = value

predictions

{892: 0,
 893: 1,
 894: 0,
 895: 0,
 896: 1,
 897: 0,
 898: 1,
 899: 0,
 900: 1,
 901: 0,
 902: 0,
 903: 0,
 904: 1,
 905: 0,
 906: 1,
 907: 1,
 908: 0,
 909: 0,
 910: 1,
 911: 1,
 912: 0,
 913: 0,
 914: 1,
 915: 1,
 916: 1,
 917: 0,
 918: 1,
 919: 0,
 920: 0,
 921: 0,
 922: 0,
 923: 0,
 924: 1,
 925: 1,
 926: 1,
 927: 0,
 928: 1,
 929: 1,
 930: 0,
 931: 0,
 932: 0,
 933: 0,
 934: 0,
 935: 1,
 936: 1,
 937: 0,
 938: 0,
 939: 0,
 940: 1,
 941: 1,
 942: 0,
 943: 0,
 944: 1,
 945: 1,
 946: 0,
 947: 0,
 948: 0,
 949: 0,
 950: 0,
 951: 1,
 952: 0,
 953: 0,
 954: 0,
 955: 1,
 956: 1,
 957: 1,
 958: 1,
 959: 0,
 960: 1,
 961: 1,
 962: 1,
 963: 0,
 964: 1,
 965: 1,
 966: 1,
 967: 0,
 968: 0,
 969: 1,
 970: 0,
 971: 1,
 972: 0,
 973: 0,
 974: 0,
 975: 0,
 976: 0,
 977: 0,
 978: 1,
 979: 1,
 980: 1,
 981: 0,
 982: 1,
 983: 0,
 984: 1,
 985: 0,
 986: 1,
 987: 0,
 988: 1,
 989: 0,
 990: 1,
 991: 0,
 992: 1,
 993: 0,
 994: 0,
 995: 0,
 996: 1,
 997: 0,
 998: 0,
 999: 0,
 1000: 0,
 1001: 0,
 1002: 0

In [112]:
sub_df = pd.DataFrame(predictions.items(), columns=['PassengerId', 'Survived'])
sub_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [114]:
sub_df.to_csv('submission1.csv',index=False)