In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling

# Show floats with thousands separators and two decimals, right-aligned in a
# 20-character field, wherever pandas renders them.
# (`%matplotlib inline` is already issued next to the plotting imports above,
# so it is not repeated here.)
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
import seaborn as sns
# Use seaborn's plain white theme for the figures in this notebook
sns.set(style="white")

# Pull in seaborn's example "mpg" dataset
mpg = sns.load_dataset("mpg")
# Frequency of each distinct horsepower value
mpg["horsepower"].value_counts()

150.0    22
90.0     20
88.0     19
110.0    18
100.0    17
95.0     14
75.0     14
67.0     12
105.0    12
70.0     12
65.0     10
85.0      9
97.0      9
145.0     7
140.0     7
80.0      7
68.0      6
72.0      6
84.0      6
78.0      6
92.0      6
175.0     5
115.0     5
180.0     5
60.0      5
86.0      5
130.0     5
71.0      5
170.0     5
165.0     4
         ..
82.0      1
54.0      1
102.0     1
64.0      1
132.0     1
77.0      1
142.0     1
135.0     1
138.0     1
133.0     1
103.0     1
66.0      1
89.0      1
149.0     1
108.0     1
152.0     1
208.0     1
148.0     1
93.0      1
61.0      1
122.0     1
91.0      1
49.0      1
230.0     1
116.0     1
94.0      1
167.0     1
158.0     1
137.0     1
107.0     1
Name: horsepower, Length: 93, dtype: int64

In [4]:
import env

def get_connection(db, user=None, host=None, password=None):
    """Build a ``mysql+pymysql://`` connection URL for database `db`.

    Parameters
    ----------
    db : str
        Database name placed at the end of the URL.
    user, host, password : str, optional
        Connection settings. Any value left as None is read from the
        matching attribute of the `env` module (`env.user`, `env.host`,
        `env.password`) at call time.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://user:password@host/db`` with the
        user name and password percent-encoded.
    """
    from urllib.parse import quote

    user = env.user if user is None else user
    host = env.host if host is None else host
    password = env.password if password is None else password

    # Credentials may contain URL delimiters ('@', ':', '/', '+', spaces ...).
    # Left raw they corrupt the URL, so encode every reserved character.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

# Read up to 100,000 rows of elo_db.transactions into a DataFrame
df = pd.read_sql(
    sql='SELECT * FROM elo_db.transactions limit 100000;',
    con=get_connection('elo_db'),
)

df.head()

Unnamed: 0,transaction_id,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.7,2017-06-25 15:33:07,1.0,16,37
1,1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.73,2017-07-15 12:10:45,1.0,16,16
2,2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.72,2017-08-09 22:04:29,1.0,16,37
3,3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.74,2017-09-02 10:06:26,1.0,16,34
4,4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.72,2017-03-10 01:14:19,1.0,16,37


In [5]:
# How many transactions carry each authorized_flag value
df["authorized_flag"].value_counts()

Y    93002
N     6998
Name: authorized_flag, dtype: int64

In [6]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

Unnamed: 0,transaction_id,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
count,100000.0,100000,100000,100000.0,100000,100000.0,99664,100000.0,99528,100000.0,100000.0,100000,92369.0,100000.0,100000.0
unique,,2,463,,2,,3,,20840,,,96942,,,
top,,Y,C_ID_9e6b3e491a,,N,,A,,M_ID_00a6ca8a8a,,,2017-11-17 00:00:00,,,
freq,,93002,1300,,93893,,56005,,3752,,,90,,,
mean,49999.5,,,126.21,,0.54,,473.23,,-5.06,-0.34,,2.22,10.93,27.51
std,28867.66,,,98.29,,0.96,,248.37,,3.73,46.32,,1.55,6.22,9.54
min,0.0,,,-1.0,,-1.0,,-1.0,,-13.0,-0.75,,1.0,-1.0,-1.0
25%,24999.75,,,57.0,,0.0,,278.0,,-8.0,-0.72,,1.0,9.0,19.0
50%,49999.5,,,88.0,,0.0,,451.0,,-4.0,-0.69,,1.0,9.0,33.0
75%,74999.25,,,189.0,,1.0,,705.0,,-2.0,-0.62,,3.0,16.0,34.0


In [7]:
# Column dtypes, non-null counts and memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
transaction_id          100000 non-null int64
authorized_flag         100000 non-null object
card_id                 100000 non-null object
city_id                 100000 non-null int64
category_1              100000 non-null object
installments            100000 non-null int64
category_3              99664 non-null object
merchant_category_id    100000 non-null int64
merchant_id             99528 non-null object
month_lag               100000 non-null int64
purchase_amount         100000 non-null float64
purchase_date           100000 non-null object
category_2              92369 non-null float64
state_id                100000 non-null int64
subsector_id            100000 non-null int64
dtypes: float64(2), int64(7), object(6)
memory usage: 11.4+ MB


In [None]:
#### Impute the Mode
# NOTE(review): SimpleImputer is imported but not used anywhere in the visible
# part of this cell; the mode imputation itself appears to be unwritten.
from sklearn.impute import SimpleImputer
import numpy as np
# Selecting with an empty list keeps the index but no columns; presumably a
# placeholder for the columns to impute (TODO: fill in the column names).
loc_df = df[[]]

def encode_embarked(df):
    """Return a copy of `df` with an added `embarked_encode` column.

    A new LabelEncoder is fitted on `df.embarked` and applied to that same
    column; the resulting integer labels are attached with `DataFrame.assign`,
    so the frame passed in is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has an `embarked` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra `embarked_encode` column.
    """
    # LabelEncoder is not imported anywhere in the visible notebook, so bring
    # it into scope here; without this the function fails with NameError.
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    return df.assign(embarked_encode=encoder.fit_transform(df.embarked))


# NOTE(review): the df.info() output earlier in this notebook lists no
# `regionidzip` column for `df`; confirm which frame this cell is meant for.

# The 100 most frequent zip codes and their row counts. rename_axis() plus
# reset_index(name=...) names both columns explicitly instead of renaming the
# default labels produced by reset_index(), which differ between pandas versions.
zip_df = (
    df.regionidzip
    .value_counts()
    .head(100)
    .rename_axis('regionidzip')
    .reset_index(name='count')
)

# Keep only rows whose zip code is among those 100, carrying the count along.
# The result is stored so it can be used after this statement.
df_reduced = pd.merge(df, zip_df, how='inner', on='regionidzip')


# Scatter of the coordinate columns, coloured by regionidzip through the
# 'tab20' colormap. Drawn on the Axes created here rather than through the
# implicit pyplot "current axes".
# NOTE(review): latitude is on the x-axis and longitude on the y-axis, exactly
# as before; swap them if a conventional map orientation is wanted.
fig, axes = plt.subplots(figsize=(15, 15))

axes.scatter(df['latitude'], df['longitude'],
             c=df.regionidzip, cmap='tab20')
axes.set(xlabel='latitude', ylabel='longitude',
         title='Locations coloured by regionidzip')

plt.show()