-
Notifications
You must be signed in to change notification settings - Fork 0
/
020-titanic-predict-logic-regression.py
90 lines (72 loc) · 3.99 KB
/
020-titanic-predict-logic-regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os, pickle
import pandas as pd
import titanic_utils as tu
from sklearn import linear_model, preprocessing
from askew_utils import DF_Magic as dfm
'''This module uses Regression models, meaning the data is ordered and continuous.'''
# Load the cleaned test set produced by the 010 preprocessing step.
# NOTE(fix): the original wrapped an `os.path.exists` check in try/except,
# so a MISSING file raised nothing and `test` was silently left undefined.
# Here `open` is allowed to raise, the failure is caught narrowly, and the
# original cause is chained for debugging.
try:
    with open("010-test_lowercase_cols.pickle", 'rb') as in_file:
        test = pickle.load(in_file)
    print("Loading 010-test_lowercase_cols.pickle")
except (OSError, pickle.UnpicklingError) as err:
    raise Exception("Unable to open 010-test_lowercase_cols.pickle") from err
# Load the cleaned training feature matrix from the 010 step.
# NOTE(fix): same pattern as the test-set load — a missing file previously
# slipped past the exists-check inside try/except and left `training_data`
# undefined; now the failure is explicit and the cause is chained.
try:
    with open("010-training_data_lowercase_cols.pickle", 'rb') as in_file:
        training_data = pickle.load(in_file)
    print("Loading 010-training_data_lowercase_cols.pickle")
except (OSError, pickle.UnpicklingError) as err:
    raise Exception("Unable to open 010-training_data_lowercase_cols.pickle") from err
# Load the target (label) vector from the 010 step.
# NOTE(fix): missing-file case now actually raises (see the loads above),
# and the error message filename is corrected ("010-targetlowercase_cols"
# was missing the underscore and named a file that does not exist).
try:
    with open("010-target_lowercase_cols.pickle", 'rb') as in_file:
        target = pickle.load(in_file)
    print("Loading 010-target_lowercase_cols.pickle")
except (OSError, pickle.UnpicklingError) as err:
    raise Exception("Unable to open 010-target_lowercase_cols.pickle") from err
# Load the cleaned training DataFrame; if its pickle is absent, rebuild it
# from the raw CSV and clean it once.
# NOTE(fix): the fallback branch called `titanic_utils.clean_data(train)`,
# a guaranteed NameError — the module is imported as `tu` at the top of the
# file — and then cleaned the SAME frame a second time via `tu.clean_data`.
# A single call through the real alias is kept.
if os.path.exists("010-train_lowercase_cols.pickle"):
    with open("010-train_lowercase_cols.pickle", 'rb') as in_file:
        train = pickle.load(in_file)
    print("Loading 010-train_lowercase_cols.pickle")
else:
    train = dfm.get_df('train.csv')
    # Project helper (titanic_utils.py); presumably lowercases columns and
    # adds engineered fields like "title"/"familysize" — confirm in that module.
    tu.clean_data(train)
# ---- Build feature matrix, fit the logistic-regression baseline, predict ----
target = train["survived"].values  # labels the classifier learns to predict
# "title" and "familysize" are engineered columns (added during cleaning);
# including "title" improved accuracy. The variant with "sibsp" is kept
# below for history — it did not help.
#feature_names = ["pclass", "age", "sex", "fare", "title", "sibsp", "embarked", "cabin", "familysize"]
feature_names = ["pclass", "age", "sex", "fare", "title", "embarked", "cabin", "familysize"]
features = train[feature_names].values

print("#------------------------------#")
print("# Value_counts for train")
print("#------------------------------#")
print(train.columns)
print(test.columns)

# Logistic regression classifies into survival buckets; it scored roughly
# twice as well as plain linear regression on this data.
classifier = linear_model.LogisticRegression()
classifier_ = classifier.fit(features, target)

# NOTE(fix): the original built the test matrix by DROPPING hard-coded
# columns ('passengerid', 'sibsp'), which silently mispredicts if the test
# frame's column order ever differs from train's. Selecting the exact
# training feature list guarantees identical columns in identical order.
test_data = test[feature_names].values
prediction = classifier_.predict(test_data)
# Write the Kaggle submission file: PassengerID -> predicted Survived.
# NOTE(fix): removed a duplicate `import pandas as pd` — pandas is already
# imported once at the top of the file.
submission = pd.DataFrame({
    "PassengerID": test['passengerid'],
    "Survived": prediction,
})
submission.set_index('PassengerID', inplace=True)
submission.to_csv('submission_LogisticRegression.csv')

print("")
print("#------------------------------#")
print("# Accuracy BASELINE with LogisticRegression: ", end='')
# Training-set accuracy (~80.2% per the original author); adding more
# columns to `features` lowered this score.
print(classifier_.score(features, target))
print("#------------------------------#")
##################################
# The linear baseline does okay, but the data follows an arc rather than a
# line, so expand to degree-2 polynomial feature interactions and refit.
##################################
poly = preprocessing.PolynomialFeatures(degree=2)  # degree=1 reproduces the plain-feature score
poly_features = poly.fit_transform(features)
classifier_ = classifier.fit(poly_features, target)

print("\n#------------------------------#")
print("# Accuracy BASELINE with PolynomialFeatures: ", end='')
print(classifier_.score(poly_features, target))
print("#------------------------------#")

# Persist the cleaned training frame as a reusable object for the next
# pipeline stage. NOTE(fix): the handle was named `in_file` even though it
# is opened for WRITING ("wb") — renamed to `out_file` to stop misleading
# readers.
with open("020-train_cleaned_data-lowercase_cols.pickle", "wb") as out_file:
    pickle.dump(train, out_file)