In [1]:
import pandas as pd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math

%matplotlib inline

In [2]:
# Read the prepared training data, report its dimensions, and preview the
# first rows transposed so that every column name is visible at once.
AWS_df = pd.read_csv('AWS_Dataset_Preped.csv')
print(AWS_df.shape)
AWS_df.head(5).transpose()

(16404, 22)


Unnamed: 0,0,1,2,3,4
CustomerID,11000,11001,11002,11003,11004
FirstName,Jon,Eugene,Ruben,Christy,Elizabeth
LastName,Yang,Huang,Torres,Zhu,Johnson
AddressLine1,3761 N. 14th St,2243 W St.,5844 Linden Land,1825 Village Pl.,7553 Harness Circle
City,Rockhampton,Seaford,Hobart,North Ryde,Wollongong
StateProvinceName,Queensland,Victoria,Tasmania,New South Wales,New South Wales
CountryRegionName,Australia,Australia,Australia,Australia,Australia
PostalCode,4700,3198,7001,2113,2500
PhoneNumber,1 (11) 500 555-0162,1 (11) 500 555-0110,1 (11) 500 555-0184,1 (11) 500 555-0162,1 (11) 500 555-0131
BirthDate,1966-04-08,1965-05-14,1965-08-12,1968-02-15,1968-08-08


In [3]:
# Read the prepared scoring data (the customers to predict for), report its
# dimensions, and preview the first rows.
Test_df = pd.read_csv('AWS_Test_Preped.csv')
print(Test_df.shape)
Test_df.head(5)

(500, 20)


Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,18988,Courtney,Baker,8727 Buena Vista Ave.,Fremont,California,United States,94536,133-555-0128,1945-01-05,Bachelors,Management,F,S,0,2,0,5,86931,53
1,29135,Adam,Allen,3491 Cook Street,Haney,British Columbia,Canada,V2W 1W2,252-555-0173,1964-10-04,Bachelors,Skilled Manual,M,M,1,2,2,4,100125,34
2,12156,Bonnie,Raji,359 Pleasant Hill Rd,Burbank,California,United States,91502,409-555-0193,1934-01-12,Graduate Degree,Management,F,M,1,2,0,4,103985,64
3,13749,Julio,Alonso,8945 Euclid Ave.,Burlingame,California,United States,94010,175-555-0196,1958-09-22,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161,40
4,27780,Christy,Andersen,"42, boulevard Tremblay",Dunkerque,Nord,France,59140,1 (11) 500 555-0122,1965-03-19,High School,Manual,F,M,1,1,2,2,21876,33


In [4]:
# Regression target: the AveMonthSpend column of the training frame, copied
# into a standalone numpy array.
labels = np.array(AWS_df.AveMonthSpend)

In [5]:
# Create the numpy feature array or model matrix.
def encode_string(cat_features, categories=None):
    """One-hot encode a 1-D collection of category labels.

    Parameters
    ----------
    cat_features : array-like
        Labels to encode (for example a DataFrame column). Flattened to 1-D.
    categories : array-like, optional
        Labels that define the output columns. When omitted, the distinct
        values of ``cat_features`` are used, which is the original behaviour.
        Pass the training column here when encoding a second data set so
        that both matrices share the same column layout.

    Returns
    -------
    numpy.ndarray
        Dense float array of shape ``(len(cat_features), n_levels)``. Column
        ``j`` is the indicator for the ``j``-th label in sorted order.

    Raises
    ------
    ValueError
        If ``cat_features`` contains a label that is not in ``categories``.
    """
    values = np.asarray(cat_features).ravel()
    reference = values if categories is None else np.asarray(categories).ravel()

    # np.unique returns the distinct labels sorted, which fixes the column order.
    levels = np.unique(reference).tolist()
    column_of = {level: j for j, level in enumerate(levels)}

    observed = values.tolist()
    unseen = sorted({label for label in observed if label not in column_of}, key=str)
    if unseen:
        raise ValueError("labels not present in categories: %r" % (unseen,))

    # Explicit integer dtype keeps the fancy index valid for empty input too.
    columns = np.array([column_of[label] for label in observed], dtype=np.intp)
    encoded = np.zeros((len(observed), len(levels)))
    encoded[np.arange(len(observed)), columns] = 1.0
    return encoded

categorical_columns = ['Gender', 'Occupation', 'Education']

dataset = [AWS_df, Test_df]

# MaritalStatus is encoded first, followed by the columns listed above; this
# fixes the left-to-right order of the indicator columns in both matrices.
encoded_columns = ['MaritalStatus'] + categorical_columns

# encode_string derives its indicator columns from whichever column it is
# given, so the two matrices only line up column-for-column when both frames
# contain exactly the same labels. Stop here rather than build matrices whose
# columns mean different things.
for col in encoded_columns:
    train_labels = set(AWS_df[col].unique())
    test_labels = set(Test_df[col].unique())
    if train_labels != test_labels:
        raise ValueError(
            "Column %r has different labels in the training and test data: %s"
            % (col, sorted(train_labels ^ test_labels, key=str))
        )

# One block of indicator columns per categorical feature, joined side by side.
Train_Features = np.concatenate([encode_string(AWS_df[col]) for col in encoded_columns], axis = 1)
Test_Features = np.concatenate([encode_string(Test_df[col]) for col in encoded_columns], axis = 1)

print(Train_Features.shape)
print(Train_Features[:2, :])

print(Test_Features.shape)
print(Test_Features[:2, :])

(16404, 14)
[[1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]]
(500, 14)
[[0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]]


In [6]:
# Append the numeric columns to the right of the one-hot indicator columns.
# Both names are rebound from their current values, so running this cell a
# second time appends the numeric columns again.
numeric_columns = ['YearlyIncome', 'NumberCarsOwned', 'Age']
Train_Features = np.concatenate((Train_Features, AWS_df[numeric_columns].values), axis=1)
Test_Features = np.concatenate((Test_Features, Test_df[numeric_columns].values), axis=1)

# Shape and first two rows of each matrix, training first.
for feature_matrix in (Train_Features, Test_Features):
    print(feature_matrix.shape)
    print(feature_matrix[:2, :])

(16404, 17)
[[1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.37947e+05 0.00000e+00 3.20000e+01]
 [0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.01141e+05 1.00000e+00 3.30000e+01]]
(500, 17)
[[0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 8.69310e+04 2.00000e+00 5.30000e+01]
 [1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00125e+05 2.00000e+00 3.40000e+01]]


In [7]:
# Name the modelling inputs. These are plain rebinds, not copies: X_train and
# X_test refer to the same arrays as Train_Features and Test_Features.
X_train, y_train = Train_Features, labels
X_test = Test_Features

In [8]:
# Standardise the numeric features using statistics learned from the training
# rows only, then apply the same transformation to the test rows.
#
# The numeric features are the last three columns (YearlyIncome,
# NumberCarsOwned, Age); everything to their left is a 0/1 indicator and is
# left unscaled.
n_numeric = 3
num_start = Train_Features.shape[1] - n_numeric

scaler = preprocessing.StandardScaler().fit(Train_Features[:, num_start:])

# Scale into fresh copies: X_train/X_test were aliases of the feature
# matrices, so writing in place changed Train_Features/Test_Features as well
# and re-running the cell rescaled already-scaled values.
X_train = Train_Features.copy()
X_train[:, num_start:] = scaler.transform(Train_Features[:, num_start:])
X_test = Test_Features.copy()
X_test[:, num_start:] = scaler.transform(Test_Features[:, num_start:])
X_train[:2,]

array([[ 1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.50580197,
        -1.31661475, -0.39550754],
       [ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.57928966,
        -0.43872887, -0.30654559]])

In [9]:
## Build an ordinary least squares model without an intercept term and fit it
## to the training features and labels.
## NOTE(review): each one-hot group contributes exactly one 1 per row, so the
## indicator columns are linearly dependent and their individual coefficients
## are not uniquely determined. Consider dropping one level per group or using
## a regularised model.
lin_mod = linear_model.LinearRegression(fit_intercept=False)
lin_mod.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [10]:
# Show the fitted intercept followed by the coefficient vector.
for fitted_param in (lin_mod.intercept_, lin_mod.coef_):
    print(fitted_param)

0.0
[ 7.29793401e+12  7.29793401e+12  1.05421952e+13  1.05421952e+13
  1.13961956e+11  1.13961956e+11  1.13961956e+11  1.13961956e+11
  1.13961956e+11 -1.79540912e+13 -1.79540912e+13 -1.79540912e+13
 -1.79540912e+13 -1.79540912e+13  1.71999112e+01  4.37261284e+00
 -2.71589622e+00]


In [12]:
# Score the test customers with the fitted model.
solution = lin_mod.predict(X_test)

# First export: the bare predictions, one value per line.
np.savetxt('Regression_Solution1.csv', solution, delimiter=',')

# Second export: the same predictions paired with each customer's id.
my_submission = pd.DataFrame({'CustomerID': Test_df['CustomerID'], 'AveMonthSpend': solution})
my_submission.to_csv('Regression_Solution2.csv', index=False)