In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import MaxAbsScaler

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Keep the old path as a fallback
# so the notebook still imports on legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline



In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
data.shape

(550068, 12)

In [5]:
# Number of distinct User_ID values
data.User_ID.nunique()

5891

In [6]:
# Check for NaN vals in User_ID
data.User_ID.isnull().any()

False

In [7]:
# Number of distinct Product_ID values
data.Product_ID.nunique()

3631

In [8]:
# Check for NaN vals in Product_ID 's
data.Product_ID.isnull().any()

False

In [9]:
# Distribution of Gender
data.Gender.value_counts()

M    414259
F    135809
Name: Gender, dtype: int64

In [10]:
# Check for NaN vals in Gender
data.Gender.isnull().any()

False

In [11]:
# Ratio of Male to Female purchases.
# The counts are looked up by label ('M' / 'F') instead of by position:
# value_counts() orders rows by frequency, so positional access only yields
# male/female while 'M' happens to be the most frequent value, and integer
# keys on a string-labelled Series are deprecated in pandas.
gender_counts = data.Gender.value_counts()
print("Male Purchase Count / Female Purchase Count = ", gender_counts['M'] / gender_counts['F'])

Male Purchase Count / Female Purchase Count =  3.05030594438


In [12]:
# City Categories
data.City_Category.value_counts()

B    231173
C    171175
A    147720
Name: City_Category, dtype: int64

In [13]:
# Check for NaN vals in City_Category
data.City_Category.isnull().any()

False

In [14]:
# Age
data.Age.value_counts()

26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64

In [15]:
# Check for NaN vals in Age
data.Age.isnull().any()

False

In [16]:
# Occupation
data.Occupation.value_counts()

4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
Name: Occupation, dtype: int64

In [17]:
# Check for NaN vals in Occupation
data.Occupation.isnull().any()

False

In [18]:
data.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [19]:
# Auto-EDA: for every column print the number of distinct (non-null) values
# and whether it contains missing values; collect the names of the columns
# that do in `missingvalfeatures`.
features = data.columns
missingvalfeatures = []

for feature in features:
    column = data[feature]
    has_missing = column.isnull().any()  # evaluated once, reused below
    valcount = column.nunique()
    print("* ",feature, " *")
    print("Value Count: ", valcount)
    print("Missing Values? ", has_missing)
    if has_missing:
        missingvalfeatures.append(feature)
    print(' ')
print("Found ", len(missingvalfeatures), "features with missing values!")

*  User_ID  *
Value Count:  5891
Missing Values?  False
 
*  Product_ID  *
Value Count:  3631
Missing Values?  False
 
*  Gender  *
Value Count:  2
Missing Values?  False
 
*  Age  *
Value Count:  7
Missing Values?  False
 
*  Occupation  *
Value Count:  21
Missing Values?  False
 
*  City_Category  *
Value Count:  3
Missing Values?  False
 
*  Stay_In_Current_City_Years  *
Value Count:  5
Missing Values?  False
 
*  Marital_Status  *
Value Count:  2
Missing Values?  False
 
*  Product_Category_1  *
Value Count:  20
Missing Values?  False
 
*  Product_Category_2  *
Value Count:  17
Missing Values?  True
 
*  Product_Category_3  *
Value Count:  15
Missing Values?  True
 
*  Purchase  *
Value Count:  18105
Missing Values?  False
 
Found  2 features with missing values!


In [20]:
# Culprits
missingvalfeatures

['Product_Category_2', 'Product_Category_3']

In [21]:
# Number of rows where Product_Category_2 is missing
int(data.Product_Category_2.isnull().sum())

173638

In [22]:
print("Missing: ", (len(data[data.Product_Category_2.isnull()]) / len(data.Product_Category_2))*100, "%")

Missing:  31.56664266963357 %


In [23]:
# Number of rows where Product_Category_3 is missing
int(data.Product_Category_3.isnull().sum())

383247

In [24]:
print("Missing: ", (len(data[data.Product_Category_3.isnull()]) / len(data.Product_Category_3))*100, "%")

Missing:  69.67265865311198 %


In [25]:
# Product 2 vals
data.Product_Category_2.value_counts()

8.0     64088
14.0    55108
2.0     49217
16.0    43255
15.0    37855
5.0     26235
4.0     25677
6.0     16466
11.0    14134
17.0    13320
13.0    10531
9.0      5693
12.0     5528
10.0     3043
3.0      2884
18.0     2770
7.0       626
Name: Product_Category_2, dtype: int64

In [26]:
# Product 3 vals
data.Product_Category_3.value_counts()

16.0    32636
15.0    28013
14.0    18428
17.0    16702
5.0     16658
8.0     12562
9.0     11579
12.0     9246
13.0     5459
6.0      4890
18.0     4629
4.0      1875
11.0     1805
10.0     1726
3.0       613
Name: Product_Category_3, dtype: int64

In [27]:
features

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [28]:
# User_ID is left out of the feature set for now; remove the column in place.
del data['User_ID']

In [29]:
# Separate the target from the features without mutating `data`.
# The original `data.pop('Purchase')` removed the column in place (so the cell
# raised KeyError when run a second time) and `X = data` made X an alias of
# `data`, so any later in-place edit of X also changed `data`.
y = data['Purchase']
X = data.drop('Purchase', axis=1)

In [30]:
print(y.shape)
print(X.shape)

(550068,)
(550068, 10)


In [31]:
X.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,P00069042,F,0-17,10,A,2,0,3,,
1,P00248942,F,0-17,10,A,2,0,1,6.0,14.0
2,P00087842,F,0-17,10,A,2,0,12,,
3,P00085442,F,0-17,10,A,2,0,12,14.0,
4,P00285442,M,55+,16,C,4+,0,8,,


In [32]:
# Every column of X is treated as categorical and one-hot encoded, including
# the integer-coded ones and the high-cardinality Product_ID.
categorical_variables = X.columns

# Missing values become their own "Missing" level. fillna is applied to the
# whole frame and returns a new object: the previous per-column
# `X[variable].fillna("Missing", inplace=True)` operated on a temporary column
# selection (not guaranteed to write back into X) and pushed strings into
# float columns in place.
X_filled = X.fillna("Missing")

# A single get_dummies call encodes all listed columns, prefixes each dummy
# with its source column name and drops the source columns. This yields the
# same column layout as the old concat/drop loop without copying the growing
# frame once per variable.
X = pd.get_dummies(X_filled, columns=list(categorical_variables))

In [33]:
# Ensure NaN is gone across the board.
X.isnull().any()

Product_ID_P00000142          False
Product_ID_P00000242          False
Product_ID_P00000342          False
Product_ID_P00000442          False
Product_ID_P00000542          False
Product_ID_P00000642          False
Product_ID_P00000742          False
Product_ID_P00000842          False
Product_ID_P00000942          False
Product_ID_P00001042          False
Product_ID_P00001142          False
Product_ID_P00001242          False
Product_ID_P00001342          False
Product_ID_P00001442          False
Product_ID_P00001542          False
Product_ID_P00001642          False
Product_ID_P00001742          False
Product_ID_P00001842          False
Product_ID_P00001942          False
Product_ID_P00002042          False
Product_ID_P00002142          False
Product_ID_P00002242          False
Product_ID_P00002342          False
Product_ID_P00002442          False
Product_ID_P00002542          False
Product_ID_P00002642          False
Product_ID_P00002742          False
Product_ID_P00002842        

In [44]:
# Print the name of every column of X, one per line.
# NOTE(review): after one-hot encoding this emits one line per dummy column,
# which makes for a very long cell output.
for col in X.columns:
    print(col)

Product_ID_P00000142
Product_ID_P00000242
Product_ID_P00000342
Product_ID_P00000442
Product_ID_P00000542
Product_ID_P00000642
Product_ID_P00000742
Product_ID_P00000842
Product_ID_P00000942
Product_ID_P00001042
Product_ID_P00001142
Product_ID_P00001242
Product_ID_P00001342
Product_ID_P00001442
Product_ID_P00001542
Product_ID_P00001642
Product_ID_P00001742
Product_ID_P00001842
Product_ID_P00001942
Product_ID_P00002042
Product_ID_P00002142
Product_ID_P00002242
Product_ID_P00002342
Product_ID_P00002442
Product_ID_P00002542
Product_ID_P00002642
Product_ID_P00002742
Product_ID_P00002842
Product_ID_P00002942
Product_ID_P00003042
Product_ID_P00003142
Product_ID_P00003242
Product_ID_P00003342
Product_ID_P00003442
Product_ID_P00003542
Product_ID_P00003642
Product_ID_P00003742
Product_ID_P00003842
Product_ID_P00003942
Product_ID_P00004042
Product_ID_P00004142
Product_ID_P00004242
Product_ID_P00004342
Product_ID_P00004442
Product_ID_P00004542
Product_ID_P00004642
Product_ID_P00004742
Product_ID_P0

Product_ID_P00156742
Product_ID_P00156842
Product_ID_P00157042
Product_ID_P00157142
Product_ID_P00157242
Product_ID_P00157342
Product_ID_P00157442
Product_ID_P00157542
Product_ID_P00157642
Product_ID_P00157842
Product_ID_P00157942
Product_ID_P00158042
Product_ID_P00158142
Product_ID_P00158242
Product_ID_P00158342
Product_ID_P00158442
Product_ID_P00158542
Product_ID_P00158642
Product_ID_P00158742
Product_ID_P00158842
Product_ID_P00158942
Product_ID_P00159042
Product_ID_P00159142
Product_ID_P00159242
Product_ID_P00159342
Product_ID_P00159442
Product_ID_P00159542
Product_ID_P00159642
Product_ID_P00159742
Product_ID_P00159842
Product_ID_P00159942
Product_ID_P00160042
Product_ID_P00160142
Product_ID_P00160242
Product_ID_P00160342
Product_ID_P00160442
Product_ID_P00160542
Product_ID_P00160642
Product_ID_P00160742
Product_ID_P00160842
Product_ID_P00160942
Product_ID_P00161042
Product_ID_P00161142
Product_ID_P00161242
Product_ID_P00161342
Product_ID_P00161442
Product_ID_P00161542
Product_ID_P0

In [33]:
# Remove the listed dummy columns from X, one after another, in place.
for dummy_column in ('City_Category_A', 'City_Category_C', 'Gender_F', 'Marital_Status_0'):
    X.drop(dummy_column, axis=1, inplace=True)

In [34]:
scaler = MaxAbsScaler()

In [35]:
# Fit the scaler on the full feature matrix and transform it in one step.
# NOTE(review): the scaler is fitted before the train/test split below, so
# the held-out rows contribute to the fitted scale — confirm this is intended.
X_scaled = scaler.fit_transform(X)

In [36]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=42)

In [37]:
# Purchase is a continuous amount (18105 distinct values in this data), so
# this is a regression task. LogisticRegression is a classifier and would
# treat every distinct amount as its own class, which is why the fit below
# never finished. Use the regressor that is already imported instead.
# n_estimators is pinned because its default differs between scikit-learn
# versions; random_state makes the fit reproducible.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

In [38]:
# Fit the model on the training split
model.fit(X_train,y_train)

KeyboardInterrupt: 