# Libraries & Settings

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Acquire Data

In [2]:
# Read the training and test splits from the local Data directory.
df_train = pd.read_csv("./Data/train.csv")
df_test = pd.read_csv("./Data/test.csv")

In [3]:
# NOTE(review): this binds a second name to the same DataFrame object; it is
# not an independent copy. Use df_train.copy() if a true snapshot is needed
# (at the cost of duplicating the frame in memory).
back_up = df_train

In [4]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1



### Comparing Columns

In [5]:
# Print the schema summary of each split, separated by a 40-underscore rule,
# so the two column sets and their dtypes can be compared by eye.
df_train.info()
print("_" * 40)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55423856 entries, 0 to 55423855
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   key                object 
 1   fare_amount        float64
 2   pickup_datetime    object 
 3   pickup_longitude   float64
 4   pickup_latitude    float64
 5   dropoff_longitude  float64
 6   dropoff_latitude   float64
 7   passenger_count    int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.3+ GB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                9914 non-null   object 
 1   pickup_datetime    9914 non-null   object 
 2   pickup_longitude   9914 non-null   float64
 3   pickup_latitude    9914 non-null   float64
 4   dropoff_longitude  9914 non-null   float64
 5   dropoff_latitude

In [6]:

# Find the columns that exist in the training data but not in the test data.
# Each frame's column names are tabulated next to their positions and the two
# tables are left-joined on the name: a training column with no counterpart in
# the test data ends up with a missing value in its row.
train_columns = pd.DataFrame({
    "Diff_Column": df_train.columns,
    "train_data": range(len(df_train.columns)),
})
test_columns = pd.DataFrame({
    "Diff_Column": df_test.columns,
    "test_data": range(len(df_test.columns)),
})

col_merged = train_columns.merge(test_columns, on="Diff_Column", how="left")

# Keep the names of the rows that contain at least one missing value.
diff_columns = col_merged.loc[col_merged.isnull().any(axis=1), "Diff_Column"].tolist()

In [7]:
# Reorder the training columns to mirror the test layout: every test column
# first, in test order, followed by the train-only columns found above.
re_index = df_test.columns.tolist()
re_index.extend(name for name in df_train.columns if name in diff_columns)

df_train = df_train.reindex(columns=re_index)

In [8]:
# Show the last rows of the training data to check the new column order.
df_train.tail()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
55423851,2014-03-15 03:28:00.00000070,2014-03-15 03:28:00 UTC,-74.005272,40.740027,-73.96328,40.762555,1,14.0
55423852,2009-03-24 20:46:20.0000002,2009-03-24 20:46:20 UTC,-73.957784,40.76553,-73.95164,40.773959,1,4.2
55423853,2011-04-02 22:04:24.0000004,2011-04-02 22:04:24 UTC,-73.970505,40.752325,-73.960537,40.797342,1,14.1
55423854,2011-10-26 05:57:51.0000002,2011-10-26 05:57:51 UTC,-73.980901,40.764629,-73.870605,40.773963,1,28.9
55423855,2014-12-12 11:33:00.00000015,2014-12-12 11:33:00 UTC,-73.969722,40.797668,-73.970885,40.783313,1,7.5


In [None]:
# Per-column overview of each split: column name, non-null count, number of
# distinct values and dtype. The two tables are displayed side by side under
# a two-level column header ('train info' / 'test info').
train_info = pd.DataFrame(
    zip(df_train.columns, df_train.count(), df_train.nunique(), df_train.dtypes),
    columns=['Column', 'Count', 'Unique', 'Dtype'],
)
test_info = pd.DataFrame(
    zip(df_test.columns, df_test.count(), df_test.nunique(), df_test.dtypes),
    columns=['Column', 'Count', 'Unique', 'Dtype'],
)
pd.concat(
    [train_info, test_info],
    axis=1,
    join='outer',
    keys=['train info', 'test info'],
)

## Data Processing

In [None]:
# Summary statistics of the training data (describe() with default settings).
df_train.describe()

In [None]:
# Same summary restricted to the object-dtype columns ("O").
df_train.describe(include=["O"])

In [None]:
# Number of distinct values in each test-set column, keyed by column name.
uniques = {name: values.nunique() for name, values in df_test.items()}

In [None]:
# Display the per-column distinct-value counts.
uniques

In [None]:
# Collect the columns with at most 10 distinct values (as counted in
# `uniques`); these are the ones handled as categorical below.
category_list = [name for name, n_unique in uniques.items() if n_unique <= 10]

In [None]:
# Display the columns selected by the low-cardinality rule.
category_list

In [None]:
# Cast every selected column to the pandas 'category' dtype in both splits.
for frame in (df_train, df_test):
    for col in category_list:
        frame[col] = frame[col].astype('category')

In [None]:
def transform_index(data, columns):
    """Replace each listed column with integer codes, in place.

    Every distinct value of a column is mapped to an integer in order of first
    appearance (the first value seen becomes 0, the next new value 1, ...).
    The original column is removed and the encoded column is appended as the
    last column of ``data``, so the column order changes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    columns : iterable of str
        Names of the columns to encode. An empty iterable leaves ``data``
        untouched.

    Returns
    -------
    None
        ``data`` is mutated; nothing is returned.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.

    Notes
    -----
    * Missing values are encoded as ``-1`` (the ``pandas.factorize``
      sentinel) instead of receiving an arbitrary code of their own.
    * Codes depend only on the frame being encoded: calling this function on
      two frames separately does not guarantee that the same value gets the
      same code in both.
    """
    for col in columns:
        # pop() removes the column in place and hands back its values, so the
        # re-inserted column lands at the end, as callers expect.
        values = data.pop(col)
        # Factorize the raw values positionally: this does not depend on the
        # frame's index labels or on the column's dtype (e.g. 'category'),
        # and runs vectorized instead of one Python lookup per row.
        codes, _ = pd.factorize(values.to_numpy())
        data[col] = codes.astype("int64")

In [None]:
# Integer-encode the categorical columns of both splits in place.
# NOTE(review): each frame is encoded on its own, so a given value is not
# guaranteed to receive the same code in train and test - confirm this is
# acceptable for the downstream model.
for frame in (df_train, df_test):
    transform_index(frame, category_list)