
# Libraries & Settings

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Acquire Data

In [None]:
# Locations of the train and test CSV files, relative to the notebook's working directory.
train_path = "./Data/train.csv"
test_path = "./Data/test.csv"

In [None]:
# Initial load of the test file with pandas' inferred dtypes; df_test is re-read
# with explicit dtypes further down.
df_test = pd.read_csv(test_path)

In [None]:
# Number of rows in the test set; later passed as the chunk size when the test file is re-read.
test_length = len(df_test)

In [None]:
# Count the lines of the training CSV by streaming over the file handle rather than
# building a list of every line with readlines(): the file is large enough that the
# notebook reads it in chunks later on.
# Note: the count includes the header line.
with open(train_path) as file:
    rows = sum(1 for _ in file)
print(f"Train Rows = {rows}")

In [None]:
# Read only the first five rows to inspect the column layout without loading the whole file.
df_train_tmp = pd.read_csv(train_path, nrows=5)
df_train_tmp.head()

In [None]:
# Column names and the dtypes pandas inferred from the five-row sample.
df_train_tmp.info()

In [None]:
# Explicit per-column dtypes handed to pd.read_csv: 32-bit floats for the fare and the
# coordinates, text for the timestamp (parsed after loading), 64-bit ints for the count.
traintypes = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='int64',
)

# The test mapping is the training mapping without the 'fare_amount' entry.
testtypes = {column: dtype for column, dtype in traintypes.items() if column != 'fare_amount'}

In [None]:
# Rows per chunk when streaming the training CSV.
# Alternative value kept for reference (presumably the full training row count -- TODO confirm):
#chunk_size = 55423857
chunk_size = 3000000

In [None]:
# Preview of the test frame as loaded with inferred dtypes.
df_test.head()

In [None]:
def create_hour_month(df_path, types, sizes):
    """Read a CSV in chunks and return one frame with 'pickup_datetime' parsed.

    Despite the name, no hour or month column is added here; only the timestamp
    column is converted.

    Parameters
    ----------
    df_path : str
        Path of the CSV file to read.
    types : dict
        Maps column name to dtype; its keys also select the columns that are read.
    sizes : int
        Number of rows per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, with 'pickup_datetime' converted to UTC
        timestamps at minute resolution.
    """
    def _parse_timestamps(chunk):
        # Only the first 16 characters ("YYYY-MM-DD HH:MM") are kept, so seconds and
        # anything after them are discarded before parsing with a fixed format.
        minute_text = chunk['pickup_datetime'].str.slice(0, 16)
        chunk['pickup_datetime'] = pd.to_datetime(minute_text, format='%Y-%m-%d %H:%M', utc=True)
        return chunk

    reader = pd.read_csv(df_path, usecols=list(types), dtype=types, chunksize=sizes)
    return pd.concat([_parse_timestamps(chunk) for chunk in tqdm(reader)])

In [None]:
# Stream the whole training CSV through the chunked loader, chunk_size rows at a time.
df_train = create_hour_month(train_path, traintypes, chunk_size)  # very slow on the full file

In [None]:
# Re-read the test CSV with explicit dtypes; its own row count is used as the chunk size,
# so the file arrives as a single chunk.
df_test = create_hour_month(test_path, testtypes, test_length)

In [None]:
# Preview of the typed training frame with the parsed timestamp column.
df_train.head()

In [None]:
# Preview of the typed test frame with the parsed timestamp column.
df_test.head()

### Comparing Columns

In [None]:
# Print the column/dtype summaries of both frames, separated by a horizontal rule.
df_train.info()
print('_'*40)
df_test.info()

In [None]:

# Collect the training columns that have no counterpart in the test frame.
# Each helper table pairs a column name with its position in the respective frame.
train_columns = pd.DataFrame({"Diff_Column": df_train.columns,
                              "train_data": range(len(df_train.columns))})
test_columns = pd.DataFrame({'Diff_Column': df_test.columns,
                             'test_data': range(len(df_test.columns))})

# Left join on the column name: a name that is absent from the test frame ends up
# with a missing 'test_data' value.
col_merged = train_columns.merge(test_columns, on='Diff_Column', how='left')

# Names of the rows that contain any missing value after the join.
diff_columns = col_merged.loc[col_merged.isnull().any(axis=1), 'Diff_Column'].tolist()

In [None]:
# Reorder the training columns: the test-frame columns first (in test order),
# followed by the train-only columns in their existing order.
re_index = list(df_test.columns)
re_index.extend(name for name in df_train.columns if name in diff_columns)

df_train = df_train.reindex(columns=re_index)

In [None]:
def _column_summary(frame):
    """Tabulate, per column of `frame`, its name, non-null count, distinct-value count and dtype."""
    records = list(zip(frame.columns, frame.count(), frame.nunique(), frame.dtypes))
    return pd.DataFrame(records, columns=['Column', 'Count', 'Unique', 'Dtype'])


train_info = _column_summary(df_train)
test_info = _column_summary(df_test)

# Show the two summaries side by side under labelled top-level column groups.
pd.concat([train_info, test_info], axis=1, join='outer', keys=['train info', 'test info'])

## Data Processing

In [None]:
# Summary statistics of the numeric training columns.
df_train.describe()

### Missing Value

In [None]:
# Percentage of missing values per training column, largest first, one decimal.
# The ratio is scaled by 100 so the figures are true percentages and are directly
# comparable with the test-set percentages.
round((df_train.isnull().sum()/len(df_train)*100).sort_values(ascending=False),1)

In [None]:
# Percentage of missing values per test column, largest first, rounded to one decimal.
round((df_test.isnull().sum()/len(df_test)*100).sort_values(ascending=False),1)

In [None]:
# Absolute number of missing values per column, for both frames.
print("Train Null data\n")
print(df_train.isna().sum())
print("\nTest Null data\n")
print(df_test.isna().sum())

In [None]:
# Drop every training row that contains at least one missing value.
df_train = df_train.dropna(axis=0)  # the test frame is left untouched

In [None]:
# Same report again, to check the missing-value counts after the dropna on the training frame.
print("Train Null data\n")
print(df_train.isna().sum())
print("\nTest Null data\n")
print(df_test.isna().sum())

## EDA

### Heatmap

In [None]:
# Disabled: label-encode every column of the training frame into `st`
# (the input expected by the disabled correlation heatmap).
#st = df_train.apply(LabelEncoder().fit_transform)

In [None]:
"""
sns.set(color_codes=True)
plt.figure(figsize=(18, 18))

sns.heatmap(st.astype(float).corr(), 
            linewidths=0.2, 
            square=True, 
            linecolor='white', 
            annot=True,
            cmap="YlGnBu"
           )
plt.show()
"""

In [None]:
# Calendar month of the pickup timestamp, added as a feature to both frames.
df_train["month"] = df_train["pickup_datetime"].dt.month
df_test["month"] = df_test["pickup_datetime"].dt.month

In [None]:
"""
df_train["month"] = df_train.apply(lambda x : 1 if x["pickup_datetime"].month == 12 or x["pickup_datetime"].month < 3
                                 else 2 if 3 <= x["pickup_datetime"].month < 6 
                                        else 3 if 6 <= x["pickup_datetime"].month < 9
                                 else 4, axis=1)

df_test["month"] = df_test.apply(lambda x : 1 if x["pickup_datetime"].month == 12 or x["pickup_datetime"].month < 3
                                 else 2 if 3 <= x["pickup_datetime"].month < 6 
                                        else 3 if 6 <= x["pickup_datetime"].month < 9 
                                 else 4, axis=1)
"""

In [None]:
# Hour of day of the pickup timestamp, added as a feature to both frames.
df_train["hour"] = df_train["pickup_datetime"].dt.hour
df_test["hour"] = df_test["pickup_datetime"].dt.hour

In [None]:
# The timestamp has been reduced to month/hour features; remove the raw column in place.
# Note: in-place drop, so re-running this cell raises KeyError because the column is gone.
df_train.drop(["pickup_datetime"], axis=1, inplace=True)
df_test.drop(["pickup_datetime"], axis=1, inplace=True)

In [None]:
def _abs_micro_delta(frame, start_col, end_col):
    """Absolute difference of two columns of `frame`, scaled by 1,000,000 and cast to int."""
    return (abs(frame[start_col] - frame[end_col])*1000000).astype(int)


# Longitude is handled before latitude, and the test frame before the training frame,
# so the new columns are appended in that order.
for axis_name in ("longitude", "latitude"):
    for frame in (df_test, df_train):
        frame[f"{axis_name}_diff"] = _abs_micro_delta(
            frame, f"pickup_{axis_name}", f"dropoff_{axis_name}")

In [None]:
# Remove the four raw coordinate columns from both frames in place.
coordinate_columns = ["pickup_longitude","dropoff_longitude","pickup_latitude","dropoff_latitude"]
for frame in (df_test, df_train):
    frame.drop(columns=coordinate_columns, inplace=True)

In [None]:
# Preview of the training frame after feature engineering.
df_train.head()

In [None]:
# Preview of the test frame after feature engineering.
df_test.head()

In [None]:
# Columns and dtypes of the engineered training frame.
df_train.info()

In [None]:
# Columns and dtypes of the engineered test frame.
df_test.info()

In [None]:
# Rescale the target by 10,000 and cast it to an integer (the cast truncates).
# NOTE(review): from here on the target, and every error metric computed on it, is in
# units of 1/10000 of the original fare. The cell is not idempotent: re-running it
# scales the column again.
df_train["fare_amount"] = (df_train["fare_amount"]*10000).astype(int)

### Train_test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Separate features and target. pop() removes 'fare_amount' from the copy only,
# so df_train itself keeps the column.
dataset_x= df_train.copy()
dataset_y = dataset_x.pop('fare_amount')

In [None]:
def regression_data(test_size=None, random_state=None):
    """Split the notebook-level ``dataset_x`` / ``dataset_y`` into train and hold-out parts.

    Parameters
    ----------
    test_size : float, int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps scikit-learn's default
        hold-out share.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` gives a different random split
        on every call; pass an integer to obtain the same split each time, e.g. to
        evaluate several models on identical data.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. Note that this order differs from
        the ``(X_train, X_test, y_train, y_test)`` order returned by
        ``train_test_split``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset_x, dataset_y, test_size=test_size, random_state=random_state)
    return X_train, y_train, X_test, y_test


def visualize_regression(y_tes, y_pred):
    """Scatter predicted against true target values and print RMSE and R^2.

    Parameters
    ----------
    y_tes : array-like
        True target values.
    y_pred : array-like
        Predicted target values, in the same order and of the same length as ``y_tes``.

    Returns
    -------
    None
        The function works through side effects: it updates global matplotlib
        rcParams, draws two scatter series on the current pyplot figure and
        prints the two metrics.
    """
    plt.rcParams["figure.figsize"] = (10,4)
    plt.rcParams['lines.linewidth'] = 4
    plt.rcParams['lines.color'] = 'r'
    plt.rcParams['axes.grid'] = True

    # Both series share the same x positions: the sample's index within the hold-out set.
    sample_index = np.arange(len(y_pred))
    plt.scatter(sample_index, y_pred, label='pred')
    plt.scatter(sample_index, y_tes, color='red', label='true')
    plt.legend()  # without a legend the series labels are never shown

    plt.title('Scattered plot')
    print('RMSE:', mean_squared_error(y_tes, y_pred) ** 0.5)
    # R^2 is printed as returned: it can be negative, so taking its square root would
    # neither be R^2 nor be defined for poorly fitting models.
    print('R2_score :', r2_score(y_tes, y_pred))

In [None]:
# Draw a random split; shapes are printed in the order X_train, X_test, y_train, y_test.
X_train, y_train, X_test, y_test = regression_data()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### Random Forest

In [None]:
# The target is a numeric fare that is evaluated with RMSE / R^2, so a regressor is
# required. A classifier would treat every distinct fare value as a separate class.
# n_jobs=-1 builds the trees in parallel; random_state makes the forest reproducible.
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators=2000, n_jobs=-1, random_state=0)
random_forest.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the hold-out features.
y_pred = random_forest.predict(X_test)

In [None]:
# Plot predictions against the true hold-out targets and print the error metrics.
visualize_regression(y_test, y_pred)

### LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Fresh random split for the LightGBM run. regression_data() is unseeded by default,
# so this is not the same split as any earlier call.
X_train, y_train, X_test, y_test = regression_data()

# Monotone constraints are deliberately left unset: a constraint list needs exactly
# one entry per feature, and an all-zero list constrains nothing.
gbm = lgb.LGBMRegressor(learning_rate=0.07,
                        n_estimators=64,
                        seed=0)

# Early stopping and log silencing are configured through callbacks, because fit()
# no longer accepts `early_stopping_rounds` / `verbose` in LightGBM 4.x.
# NOTE(review): the hold-out set doubles as the early-stopping set, so the reported
# metrics are optimistic.
gbm.fit(X_train, y_train,
        eval_metric='l2',
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=5, verbose=False),
                   lgb.log_evaluation(period=0)])

print('Best Iteration:', gbm.best_iteration_)
print()
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

visualize_regression(y_test, y_pred)

In [None]:
# Predict on the prepared test frame with the fitted LightGBM model (`gbm`), using its
# best early-stopping iteration. The frame is passed as-is so the feature names travel
# with the data.
# Note: the model was fitted on the rescaled target (fare_amount * 10000), so `res` is
# on that scale; divide by 10000 to get back to the original fare unit.
res = gbm.predict(df_test, num_iteration=gbm.best_iteration_)