In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [17]:
yellowdf = pd.read_csv('data/yellow1.csv')

In [3]:
yellowdf.payment_type.value_counts()

credit       5485999
no charge      33186
Dispute        11164
Name: payment_type, dtype: int64

In [18]:
greendf = pd.read_csv('data/green1.csv')

In [5]:
yellowdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5530349 entries, 0 to 5530348
Data columns (total 24 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   tip                    int64  
 1   car_type               object 
 2   pickup_datetime        object 
 3   dropoff_datetime       object 
 4   passenger_count        int64  
 5   trip_distance          float64
 6   ratecodeid             object 
 7   pickup_location_id     int64  
 8   dropoff_location_id    int64  
 9   payment_type           object 
 10  fare_amount            float64
 11  extra                  float64
 12  mta_tax                float64
 13  tolls_amount           float64
 14  improvement_surcharge  float64
 15  total_amount           float64
 16  week_of_month          int64  
 17  pickup_hour            int64  
 18  dropoff_hour           int64  
 19  day                    object 
 20  pickup_borough         object 
 21  pickup_zone            object 
 22  dropoff_borough   

In [6]:
greendf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393050 entries, 0 to 393049
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   tip                    393050 non-null  int64  
 1   car_type               393050 non-null  object 
 2   pickup_datetime        393050 non-null  object 
 3   dropoff_datetime       393050 non-null  object 
 4   ratecodeid             393050 non-null  object 
 5   pickup_location_id     393050 non-null  int64  
 6   dropoff_location_id    393050 non-null  int64  
 7   passenger_count        393050 non-null  int64  
 8   trip_distance          393050 non-null  float64
 9   fare_amount            393050 non-null  float64
 10  extra                  393050 non-null  float64
 11  mta_tax                393050 non-null  float64
 12  tolls_amount           393050 non-null  float64
 13  improvement_surcharge  393050 non-null  float64
 14  total_amount           393050 non-nu

## Correcting column order

Columns were stored differently from source, correcting the order to concatenate

In [19]:
cols = list(greendf.columns)

In [20]:
cols.insert(4, cols.pop(cols.index('passenger_count')))

In [21]:
greendf = greendf[cols]

In [22]:
cols.insert(5, cols.pop(cols.index('trip_distance')))

In [23]:
greendf = greendf[cols]

In [24]:
cols.insert(9, cols.pop(cols.index('payment_type')))
greendf = greendf[cols]

In [25]:
list(yellowdf.columns) == list(greendf.columns)

True

In [26]:
df = pd.concat([yellowdf, greendf])

In [18]:
df.shape

(5923399, 24)

In [19]:
yellowdf.shape

(5530349, 24)

In [20]:
greendf.shape

(393050, 24)

In [21]:
df.tip.value_counts()

1    5473524
0     449875
Name: tip, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8060793 entries, 0 to 393049
Data columns (total 24 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   tip                    int64  
 1   car_type               object 
 2   pickup_datetime        object 
 3   dropoff_datetime       object 
 4   passenger_count        float64
 5   trip_distance          float64
 6   ratecodeid             object 
 7   pickup_location_id     float64
 8   dropoff_location_id    float64
 9   payment_type           object 
 10  fare_amount            float64
 11  extra                  float64
 12  mta_tax                float64
 13  tolls_amount           float64
 14  improvement_surcharge  float64
 15  total_amount           float64
 16  week_of_month          int64  
 17  pickup_hour            int64  
 18  dropoff_hour           int64  
 19  day                    object 
 20  pickup_borough         object 
 21  pickup_zone            object 
 22  dropoff_borough    

In [27]:
df.to_csv(r'/users/michaelharnett/desktop/metis/projects/taxi_tip_classification_metis4/data/totaldf.csv', index=False)

# Creating Dummy Columns

In [22]:
## first erasing earlier dataframes, as the first time I ran this my computer crashed.
yellowdf = 0
greendf = 0

In [None]:
len(df.pickup_zone.value_counts())

In [None]:
'''
After the second attempt, and crash, I will be removing the pickup and dropoff zones. Independently using these for dummies worked. can try to create the dummies off
smaller dataframes and merge together, but the yellowdf had over 600 columns on its own. May be to big. Borogh information will have to be enough.
'''

In [None]:
df.drop(columns = ['pickup_zone','dropoff_zone'])

In [23]:
object_list = list(df.select_dtypes('object').columns)

In [None]:
object_list

In [None]:
finaldf = pd.get_dummies(data = df, columns = object_list, drop_first=True)

In [None]:
'''
Still did not help. Looks like yellow it is!
'''

## Dummies one at a time

instead of dummifying all columns at once, lets see if we can do them individually and merge

In [5]:
df = pd.read_csv('data/totaldf.csv')

In [9]:
object_list = list(df.select_dtypes('object').columns)
object_list

['car_type',
 'pickup_datetime',
 'dropoff_datetime',
 'ratecodeid',
 'payment_type',
 'day',
 'pickup_borough',
 'pickup_zone',
 'dropoff_borough',
 'dropoff_zone']

In [7]:
dum1 = pd.get_dummies(data = df.car_type, drop_first = True)
dum1

Unnamed: 0,yellow
0,1
1,1
2,1
3,1
4,1
...,...
5923394,0
5923395,0
5923396,0
5923397,0


In [16]:
pd.merge(df, dum1, left_index=True, right_index=True)

Unnamed: 0.1,Unnamed: 0,tip,car_type,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,ratecodeid,pickup_location_id,dropoff_location_id,...,total_amount,week_of_month,pickup_hour,dropoff_hour,day,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,yellow
0,0,1,yellow,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.50,standard,151,239,...,9.95,1,0,0,Tuesday,Manhattan,Manhattan Valley,Manhattan,Upper West Side South,1
1,1,1,yellow,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.60,standard,239,246,...,16.30,1,0,1,Tuesday,Manhattan,Upper West Side South,Manhattan,West Chelsea/Hudson Yards,1
2,2,1,yellow,2019-01-01 00:21:28,2019-01-01 00:28:37,1,1.30,standard,163,229,...,9.05,1,0,0,Tuesday,Manhattan,Midtown North,Manhattan,Sutton Place/Turtle Bay North,1
3,3,1,yellow,2019-01-01 00:32:01,2019-01-01 00:45:39,1,3.70,standard,229,7,...,18.50,1,0,0,Tuesday,Manhattan,Sutton Place/Turtle Bay North,Queens,Astoria,1
4,4,1,yellow,2019-01-01 00:57:32,2019-01-01 01:09:32,2,2.10,standard,141,234,...,13.00,1,0,1,Tuesday,Manhattan,Lenox Hill West,Manhattan,Union Sq,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5923394,393045,1,green,2019-01-31 23:08:27,2019-01-31 23:22:59,1,3.33,standard,255,226,...,18.39,5,23,23,Thursday,Brooklyn,Williamsburg (North Side),Queens,Sunnyside,0
5923395,393046,1,green,2019-01-31 23:21:26,2019-01-31 23:23:05,1,0.72,standard,75,151,...,6.36,5,23,23,Thursday,Manhattan,East Harlem South,Manhattan,Manhattan Valley,0
5923396,393047,0,green,2019-01-31 23:30:05,2019-01-31 23:36:14,1,1.75,standard,75,238,...,8.30,5,23,23,Thursday,Manhattan,East Harlem South,Manhattan,Upper West Side North,0
5923397,393048,1,green,2019-01-31 23:59:58,2019-02-01 00:04:18,1,0.57,standard,74,74,...,7.30,5,23,0,Thursday,Manhattan,East Harlem North,Manhattan,East Harlem North,0
