In [1]:
import os
import pandas as pd
from pydataset import data
import env
import seaborn as sns

# import our own acquire module
import acquire

import numpy as np
import matplotlib.pyplot as plt


# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from prepare import my_train_test_split

# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load seaborn's "taxis" example dataset and preview the first five rows.
taxis = sns.load_dataset(name='taxis')

taxis.head(n=5)

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [3]:
# Summarise dtypes and non-null counts per column to see where values are missing.
taxis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pickup           6433 non-null   object 
 1   dropoff          6433 non-null   object 
 2   passengers       6433 non-null   int64  
 3   distance         6433 non-null   float64
 4   fare             6433 non-null   float64
 5   tip              6433 non-null   float64
 6   tolls            6433 non-null   float64
 7   total            6433 non-null   float64
 8   color            6433 non-null   object 
 9   payment          6389 non-null   object 
 10  pickup_zone      6407 non-null   object 
 11  dropoff_zone     6388 non-null   object 
 12  pickup_borough   6407 non-null   object 
 13  dropoff_borough  6388 non-null   object 
dtypes: float64(5), int64(1), object(8)
memory usage: 703.7+ KB


In [4]:
# How many rides were paid with each payment method?
taxis['payment'].value_counts()

credit card    4577
cash           1812
Name: payment, dtype: int64

In [5]:
# Ride counts per taxi color (this is the column later used to stratify the split).
taxis['color'].value_counts()

yellow    5451
green      982
Name: color, dtype: int64

In [6]:
# Ride counts per pickup zone; a high-cardinality column (see Length in the output).
taxis['pickup_zone'].value_counts()

Midtown Center                         230
Upper East Side South                  211
Penn Station/Madison Sq West           210
Clinton East                           208
Midtown East                           198
                                      ... 
Homecrest                                1
Bedford Park                             1
Riverdale/North Riverdale/Fieldston      1
Hollis                                   1
Hillcrest/Pomonok                        1
Name: pickup_zone, Length: 194, dtype: int64

In [7]:
# Summary statistics of the numeric columns, one row per column.
taxis.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengers,6433.0,1.539251,1.203768,0.0,1.0,1.0,2.0,6.0
distance,6433.0,3.024617,3.827867,0.0,0.98,1.64,3.21,36.7
fare,6433.0,13.091073,11.551804,1.0,6.5,9.5,15.0,150.0
tip,6433.0,1.97922,2.44856,0.0,0.0,1.7,2.8,33.2
tolls,6433.0,0.325273,1.415267,0.0,0.0,0.0,0.0,24.02
total,6433.0,18.517794,13.81557,1.3,10.8,14.16,20.3,174.82


In [8]:
## drop the pickup/dropoff timestamps and the zone-level columns
## (the borough-level columns are kept)

taxis = taxis.drop(
    columns=['pickup', 'dropoff', 'pickup_zone', 'dropoff_zone'],
)

In [9]:
# Confirm the four columns are gone.
taxis.head(n=5)

Unnamed: 0,passengers,distance,fare,tip,tolls,total,color,payment,pickup_borough,dropoff_borough
0,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Manhattan,Manhattan
1,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Manhattan,Manhattan
2,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Manhattan,Manhattan
3,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Manhattan,Manhattan
4,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Manhattan,Manhattan


In [10]:
## count the missing values in each column

taxis.isna().sum()

passengers          0
distance            0
fare                0
tip                 0
tolls               0
total               0
color               0
payment            44
pickup_borough     26
dropoff_borough    45
dtype: int64

In [11]:
# Remove every row that has at least one missing value.
# The original index labels are kept (the index is not reset).
taxis = taxis.dropna(axis=0, how='any')

In [12]:
# Verify that no missing values remain after dropping rows.
taxis.isna().sum()

passengers         0
distance           0
fare               0
tip                0
tolls              0
total              0
color              0
payment            0
pickup_borough     0
dropoff_borough    0
dtype: int64

In [13]:
## one-hot encode color, payment, pickup_borough and dropoff_borough;
## drop_first=True leaves out the first level of each column

taxi_dummy = pd.get_dummies(
    data=taxis[['color', 'payment', 'pickup_borough', 'dropoff_borough']],
    drop_first=True,
)
taxi_dummy.head(n=5)

Unnamed: 0,color_yellow,payment_credit card,pickup_borough_Brooklyn,pickup_borough_Manhattan,pickup_borough_Queens,dropoff_borough_Brooklyn,dropoff_borough_Manhattan,dropoff_borough_Queens,dropoff_borough_Staten Island
0,1,1,0,1,0,0,1,0,0
1,1,0,0,1,0,0,1,0,0
2,1,1,0,1,0,0,1,0,0
3,1,1,0,1,0,0,1,0,0
4,1,1,0,1,0,0,1,0,0


In [14]:
## append the dummy columns to the right of the original columns

taxis = pd.concat(objs=[taxis, taxi_dummy], axis='columns')
taxis.head(n=5)

Unnamed: 0,passengers,distance,fare,tip,tolls,total,color,payment,pickup_borough,dropoff_borough,color_yellow,payment_credit card,pickup_borough_Brooklyn,pickup_borough_Manhattan,pickup_borough_Queens,dropoff_borough_Brooklyn,dropoff_borough_Manhattan,dropoff_borough_Queens,dropoff_borough_Staten Island
0,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Manhattan,Manhattan,1,1,0,1,0,0,1,0,0
1,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Manhattan,Manhattan,1,0,0,1,0,0,1,0,0
2,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Manhattan,Manhattan,1,1,0,1,0,0,1,0,0
3,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Manhattan,Manhattan,1,1,0,1,0,0,1,0,0
4,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Manhattan,Manhattan,1,1,0,1,0,0,1,0,0


In [15]:
## replace spaces in column names with underscores
## (currently 'payment_credit card' and 'dropoff_borough_Staten Island').
## A callable is used instead of a hard-coded mapping so that any other
## column name containing a space is normalised the same way.

taxis = taxis.rename(columns=lambda name: name.replace(' ', '_'))

In [16]:
# Re-check the frame after encoding: row count, dtypes and the renamed dummy columns.
taxis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6341 entries, 0 to 6432
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   passengers                     6341 non-null   int64  
 1   distance                       6341 non-null   float64
 2   fare                           6341 non-null   float64
 3   tip                            6341 non-null   float64
 4   tolls                          6341 non-null   float64
 5   total                          6341 non-null   float64
 6   color                          6341 non-null   object 
 7   payment                        6341 non-null   object 
 8   pickup_borough                 6341 non-null   object 
 9   dropoff_borough                6341 non-null   object 
 10  color_yellow                   6341 non-null   uint8  
 11  payment_credit_card            6341 non-null   uint8  
 12  pickup_borough_Brooklyn        6341 non-null   u

In [17]:
## group the columns into numerical and categorical variables

In [18]:
# num_vars = ['passengers', 'distance', 'fare', 'tip', 'total']

# manual alternative to the select_dtypes() call in the next cell
# (note: this hand-written list leaves out 'tolls')

In [19]:
# num_vars is a DataFrame holding only the int64/float64 columns of taxis;
# iterating over it (as the plotting loop does) yields its column names.
num_vars = taxis.select_dtypes(include = ['int64', 'float64'])

In [20]:
# Categorical variables: the original string (object) columns plus the dummy
# columns. The dummies are uint8 in this run (see taxis.info() above), i.e.
# 0/1 indicator integers; pandas >= 2.0 returns bool from get_dummies instead,
# so both dtypes are accepted to keep the dummies in this selection.
cat_vars = taxis.select_dtypes(include=['object', 'uint8', 'bool'])

In [22]:
def my_train_test_split(df, target, test_size=.2, validate_size=.25, random_state=123):
    """Split ``df`` into train, validate and test subsets, stratified on ``target``.

    The split is done in two passes: first the test set is carved off the
    full frame, then the validate set is carved off what is left. With the
    default fractions this yields roughly 60% / 20% / 20%
    (0.25 of the remaining 80% is 20% of the whole).

    NOTE(review): this definition shadows the ``my_train_test_split`` imported
    from ``prepare`` in the first cell - confirm which one is intended.

    Parameters
    ----------
    df : DataFrame to split.
    target : name of the column used for stratification in both passes.
    test_size : fraction of ``df`` held out as the test set.
    validate_size : fraction of the remaining (non-test) rows held out as
        the validate set.
    random_state : seed passed to both ``train_test_split`` calls.

    Returns
    -------
    (train, validate, test)
    """
    # First pass: hold out the test set from the full frame.
    train_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[target],
    )

    # Second pass: split the remainder into train and validate.
    train, validate = train_test_split(
        train_validate,
        test_size=validate_size,
        random_state=random_state,
        stratify=train_validate[target],
    )

    return train, validate, test

In [25]:
## split into train / validate / test, stratified on the taxi color

train, validate, test = my_train_test_split(taxis, target='color')

# (rows, columns) of each of the three subsets
tuple(subset.shape for subset in (train, validate, test))


((3804, 19), (1268, 19), (1269, 19))

In [None]:
## univariate look at every CATEGORICAL variable, on the train subset only

for col in cat_vars.columns:
    print(col)
    # absolute frequency of each level
    print(train[col].value_counts())
    # relative frequency of each level, as a percentage
    print(train[col].value_counts(normalize=True).mul(100))
    sns.countplot(data=train, x=col)
    plt.show()
    
    

In [None]:
## univariate look at every NUMERICAL variable, on the train subset only

for col in num_vars.columns:
    print(col)
    # one box plot per numeric column
    sns.boxplot(data=train, x=col)
    plt.show()

In [None]:
## DOES DISTANCE AFFECT THE FARE ?

## bivariate: fare is the outcome of interest, so it goes on the y-axis

plt.title('Does distance affect fare ?')

sns.scatterplot(x='distance', y='fare', data=train)

# No hue/label is plotted, so there is nothing for a legend to show;
# plt.legend() was removed because it has no labelled artists to list.
plt.show()

In [None]:
# multivariate: fare by passenger count, split by the credit-card payment dummy
sns.barplot(
    data=train,
    x='passengers',
    y='fare',
    hue='payment_credit_card',
)