In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from scipy import stats

In [3]:
# https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
#https://stackoverflow.com/questions/40011531/in-pandas-when-using-read-csv-how-to-assign-a-nan-to-a-value-thats-not-the



# Read data, change missing values to 'NaN'

Xtrain_raw = pd.read_csv('Xtrain.txt', sep =" ", index_col=0, dtype='float64',
                    na_values=['?','NaN'])
Xtest_raw = pd.read_csv('Xtest.txt', sep =" ", index_col=0, dtype='float64',
                    na_values=['?','NaN'])

Ytrain_raw = pd.read_csv('Ytrain.txt', sep =",", index_col=0, dtype='float64',
                    na_values=['?','NaN'])

In [4]:
Ytrain_raw.shape

(153287, 1)

In [5]:
Xtrain_raw.shape

(153287, 75)

In [6]:
Xtest_raw.shape


(50000, 75)

In [7]:
#https://stackoverflow.com/questions/51207491/function-to-replace-nan-values-in-a-dataframe-with-mean-of-the-related-column
#Replace NaN with column means in Xtrain
Xtrain_filled = Xtrain_raw.apply(lambda x: x.fillna(x.mean())) 
#print(Xtrain_filled.info())

In [8]:
#Do the same for other data-sets

Ytrain_filled = Ytrain_raw.apply(lambda x: x.fillna(x.mean())) 
Xtest_filled = Xtest_raw.apply(lambda x: x.fillna(x.mean())) 

In [9]:
Xtest_filled['#H04'].head

<bound method NDFrame.head of Id
1.0        -9.0
6.0        -9.0
9.0        -9.0
12.0       -9.0
13.0       -9.0
           ... 
177717.0   -9.0
177718.0   -9.0
177726.0   -9.0
177728.0   -9.0
177731.0   -9.0
Name: #H04, Length: 50000, dtype: float64>

In [10]:
Xtest_clean=Xtest_filled.drop(labels=[ '#H04', '#H05', '#H06', '#H07', '#H08', '#H09','#I01', '#I02', '#I03','#I04'], axis=1)
Xtrain_clean=Xtrain_filled.drop(labels=[ '#H04', '#H05', '#H06', '#H07', '#H08', '#H09','#I01', '#I02', '#I03','#I04'], axis=1)

In [11]:
#Concatenate the training and test sets and check to make sure that all observations from the dataset are there

X_filled = pd.concat([Xtrain_clean, Xtest_clean])
print('X_filled shape: ', Xtest_filled.shape)
X_filled.head

X_filled shape:  (50000, 75)


<bound method NDFrame.head of               #A01      #A02      #B01      #B02      #B03      #B04  \
Id                                                                     
2.0      -0.067006  2.000400 -0.181087 -0.288238 -0.026722 -1.332320   
3.0      -0.238565 -1.755040 -0.562340 -1.943580  0.135810 -0.027425   
4.0      -0.900302  0.760359 -0.566809  0.843982  0.902949 -0.298859   
5.0       0.448940 -0.530155 -0.723218  0.014280  0.175458  0.932137   
7.0      -0.096147  0.123017 -0.269223  0.229115 -0.041637 -0.070465   
...            ...       ...       ...       ...       ...       ...   
177717.0  2.202510  2.295670 -0.813234 -0.571600  1.107120 -0.744815   
177718.0  0.486308  0.846261 -1.108650  1.051280 -0.037331  0.511571   
177726.0 -0.184787  0.372172  2.014950 -0.135052 -0.037331 -0.022866   
177728.0 -0.353409 -0.674765  0.227447 -1.229060 -0.037331  0.403753   
177731.0  0.469434  0.458546  0.324099 -0.351323  0.888350 -1.201130   

              #B05      #B06     

In [12]:
X_sorted = X_filled.sort_index(axis=0)
#X_sorted.info()

In [13]:
#Merge training data sets
train_merged= Xtrain_clean.merge(Ytrain_filled, how='left', on='Id')
print(train_merged)

              #A01      #A02      #B01      #B02      #B03      #B04  \
Id                                                                     
2.0      -0.067006  2.000400 -0.181087 -0.288238 -0.026722 -1.332320   
3.0      -0.238565 -1.755040 -0.562340 -1.943580  0.135810 -0.027425   
4.0      -0.900302  0.760359 -0.566809  0.843982  0.902949 -0.298859   
5.0       0.448940 -0.530155 -0.723218  0.014280  0.175458  0.932137   
7.0      -0.096147  0.123017 -0.269223  0.229115 -0.041637 -0.070465   
...            ...       ...       ...       ...       ...       ...   
177724.0 -0.341901 -0.307778  0.599235  0.580111 -0.033728 -0.038976   
177725.0  1.284220  0.592979 -0.621695 -0.009792 -0.382274 -0.664959   
177727.0  0.454239  1.059110 -0.034778 -0.297135  0.171074 -0.324943   
177729.0 -0.336121  0.218963  0.985771 -0.834321 -1.452110 -0.290119   
177730.0 -0.202946  0.004449 -0.403091  0.019818  1.420010 -1.087600   

              #B05      #B06      #B07      #B08  ...      #F10

In [14]:
#Remove the Id column and reset the index

train_merged_resetindex = train_merged.reset_index()


In [15]:
train_merged_noindex = train_merged_resetindex.drop(['Id'], axis=1)
#print(train_merged_noindex)

In [16]:
#Modeling [some code reused from project 2]
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

In [17]:
y = train_merged['Value'].copy()
X = train_merged.drop(['Value'], axis=1).copy()

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=32)

In [19]:
# Linear Regression 

LinModel = LinearRegression().fit(x_train, y_train)
Lin_predictions = LinModel.predict(x_test)
mean_squared_error(y_test, Lin_predictions, squared=False)

9.641221097353835

In [65]:
#Lin_predictions = LinModel.predict(x_test)


In [66]:
#mean_squared_error(y_test, Lin_predictions, squared=False)

9.641221097353835

In [68]:
Xtest_Lin_predictions = LinModel.predict(Xtest_clean)


In [69]:
#Load the sample predictions
pred = pd.read_csv('pred.txt', sep =",", index_col=0, dtype= {'Value': np.float64})        

In [70]:
print(pred)

        prediction
Id                
1             10.0
6             10.0
9             10.0
12            10.0
13            10.0
...            ...
177717        10.0
177718        10.0
177726        10.0
177728        10.0
177731        10.0

[50000 rows x 1 columns]


In [71]:
sassa_pred_1 = pred.copy()
sassa_pred_1['prediction'] = Xtest_Lin_predictions

In [72]:
sassa_pred_1.to_csv('/Users/Jamie/sassa_pred_1.csv', index = True)

In [None]:
#Xtrain_raw.sample
sns.pairplot(Xtrain_raw.sample(n = 2, random_state=1), height = 2)

In [4]:
mu = Ytrain_raw.mean(axis=0)
std = Ytrain_raw.std(axis=0)
#mu
std

Value    10.656328
dtype: float64

In [None]:
#Xtrain_raw_copy= Xtrain_raw.copy()
#Xtrain_droppedNA = Xtrain_raw_copy.dropna()
Xtrain_droppedNA = Xtrain_raw.dropna()


In [None]:
hasattr(Xtrain_droppedNA, 'sample')

In [None]:
#scaler = StandardScaler()
Xtrain_droppedNA = scaler.fit_transform(Xtrain_droppedNA)
#df_numeric[df_numeric.columns] = scaler.fit_transform(df_numeric[df_numeric.columns])
#print(Xtrain_droppedNA['#A01'].mean())
#print(Xtrain_droppedNA['total'].std())

plt.figure(figsize=(50,40))
#cor = Xtrain_droppedNA.corr(method = 'pearson')
#sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

sns.pairplot(Xtrain_droppedNA.sample(n = 110, random_state=1), height = 2)