In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Use the 'ggplot' style sheet for every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [2]:
# Load the trip table from a path relative to the notebook's working directory.
# NOTE(review): the recorded df.head(2) output lists an 'Unnamed: 0' column while the
# recorded df.info() output does not — confirm whether this CSV carries a saved index column.
df = pd.read_csv('data/yellow1.csv')

In [7]:
# Preview the first two rows to eyeball column names and value formats.
df.head(2)

Unnamed: 0,tip,car_type,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,ratecodeid,pickup_location_id,dropoff_location_id,payment_type,...,improvement_surcharge,total_amount,week_of_month,pickup_hour,dropoff_hour,day,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone
0,1,yellow,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,standard,151,239,credit,...,0.3,9.95,1,0,0,Tuesday,Manhattan,Manhattan Valley,Manhattan,Upper West Side South
1,1,yellow,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,standard,239,246,credit,...,0.3,16.3,1,0,1,Tuesday,Manhattan,Upper West Side South,Manhattan,West Chelsea/Hudson Yards


In [3]:
# Drop the columns that are not carried forward into the feature matrix.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(
    columns=['pickup_datetime', 'dropoff_datetime', 'car_type',
             'pickup_location_id', 'dropoff_location_id',
             'pickup_zone', 'dropoff_zone'],
    errors='ignore',
)

In [9]:
# Class balance of the target: number of rows per distinct value of 'tip'.
df['tip'].value_counts()

1    5257054
0     273295
Name: tip, dtype: int64

In [10]:
# Sum of the 'total_amount' column over all rows.
df['total_amount'].sum()

92997587.93000002

In [6]:
# Print the column names, dtypes, and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5530349 entries, 0 to 5530348
Data columns (total 17 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   tip                    int64  
 1   passenger_count        int64  
 2   trip_distance          float64
 3   ratecodeid             object 
 4   payment_type           object 
 5   fare_amount            float64
 6   extra                  float64
 7   mta_tax                float64
 8   tolls_amount           float64
 9   improvement_surcharge  float64
 10  total_amount           float64
 11  week_of_month          object 
 12  pickup_hour            object 
 13  dropoff_hour           object 
 14  day                    object 
 15  pickup_borough         object 
 16  dropoff_borough        object 
dtypes: float64(7), int64(2), object(8)
memory usage: 717.3+ MB


In [5]:
# Store the calendar/time bucket columns with dtype object so that they are
# picked up by the object-dtype column selection in the next cell.
for time_col in ('week_of_month', 'pickup_hour', 'dropoff_hour'):
    df[time_col] = df[time_col].astype('object')

In [7]:
# Names of all columns currently stored with dtype object, in frame order.
object_list = df.select_dtypes(include='object').columns.tolist()
object_list

['ratecodeid',
 'payment_type',
 'week_of_month',
 'pickup_hour',
 'dropoff_hour',
 'day',
 'pickup_borough',
 'dropoff_borough']

In [9]:
# One-hot encode the object-dtype columns listed in object_list; drop_first=True
# omits one indicator per encoded column.
# This step had been commented out, but the recorded 84-column shape and the
# StandardScaler fit further down both depend on it: on a fresh kernel the
# frame would otherwise still contain string columns.
# Only columns that are still present are encoded, so re-running the cell after
# the encoding has already happened is a no-op rather than a KeyError.
columns_to_encode = [col for col in object_list if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(data=df, columns=columns_to_encode, drop_first=True)
df.shape

(5530349, 84)

In [10]:
# Select the target by NAME instead of by position. With positional slicing
# (iloc[:, 0] / iloc[:, 1:]) a stray leading column such as the 'Unnamed: 0'
# index artifact visible in the recorded df.head(2) output would silently become
# the target, and 'tip' itself would leak into the features.
y = df['tip']
# Keep the target (and the index artifact, if present) out of the feature matrix.
X = df.drop(columns=['tip', 'Unnamed: 0'], errors='ignore')

# Hold out 20% of the rows for testing, then 25% of the remaining 80% for
# validation, i.e. a 60/20/20 train/validation/test split. Seeds are fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=7)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.25, random_state=7)

# Logistic Regression 1
No hyperparameter tuning; just changing the solver to saga to handle large data better and increasing the maximum number of iterations.

In [11]:
# Logistic regression classifier constructed with no arguments, i.e. library defaults.
# NOTE(review): the text above this cell mentions switching to the saga solver and
# raising the iteration limit, but neither is set here — confirm which is intended.
logreg1 = LogisticRegression()
# Feature scaler; it is fitted on the training features in a later cell.
scaler = StandardScaler()

In [14]:
# Fit the scaler on the training features only; the other splits are not seen here.
scaler.fit(X_train)

StandardScaler()

In [15]:
# Apply the already-fitted scaler to the training and test features separately.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Train on the scaled training split, then report the score on that same split.
# fit() returns the estimator itself, so the score call is chained onto it.
logreg1.fit(X_train_scaled, y_train).score(X_train_scaled, y_train)

KeyboardInterrupt: 