In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#the target csv file
data = pd.read_csv('/Users/manueltanguma/Downloads/cars.csv', encoding='latin-1')

In [3]:
data.head(5)

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [4]:
#addressing column names to make them more readable
data.columns = [
    'mpg', 'cylinders', 'cubic_inches', 'hp',
    'weight_lbs', 'time-to-60', 'year', 'brand']

In [5]:
data.head(5)

Unnamed: 0,mpg,cylinders,cubic_inches,hp,weight_lbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
mpg             261 non-null float64
cylinders       261 non-null int64
cubic_inches    261 non-null object
hp              261 non-null int64
weight_lbs      261 non-null object
time-to-60      261 non-null int64
year            261 non-null int64
brand           261 non-null object
dtypes: float64(1), int64(4), object(3)
memory usage: 16.4+ KB


In [7]:
data.shape

(261, 8)

In [8]:
#Create a new function to find missing values, if any:
def num_missing(x):
  return sum(x.isnull())

#Applying per column:
print ("Missing values per column:")
print (data.apply(num_missing, axis=0)) #axis=0 defines that function is to be applied on each column

#Applying per row:
print ("\nMissing values per row:")
print (data.apply(num_missing, axis=1).head(20)) #axis=1 defines that function is to be applied on each row


Missing values per column:
mpg             0
cylinders       0
cubic_inches    0
hp              0
weight_lbs      0
time-to-60      0
year            0
brand           0
dtype: int64

Missing values per row:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
dtype: int64


In [9]:
data['mpg'] = data['mpg'].apply(np.int64)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
mpg             261 non-null int64
cylinders       261 non-null int64
cubic_inches    261 non-null object
hp              261 non-null int64
weight_lbs      261 non-null object
time-to-60      261 non-null int64
year            261 non-null int64
brand           261 non-null object
dtypes: int64(5), object(3)
memory usage: 16.4+ KB


In [11]:
#converting cubic_inches to int64 from object; has to occur in 2 steps; didn't work like In[9]
data['cubic_inches'] = pd.to_numeric(data['cubic_inches'], errors='coerce').fillna(0)
data['cubic_inches'] = data['cubic_inches'].astype(np.int64)

In [12]:
data['weight_lbs'] = pd.to_numeric(data['weight_lbs'], errors='coerce').fillna(0)
data['weight_lbs'] = data['weight_lbs'].astype(np.int64)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
mpg             261 non-null int64
cylinders       261 non-null int64
cubic_inches    261 non-null int64
hp              261 non-null int64
weight_lbs      261 non-null int64
time-to-60      261 non-null int64
year            261 non-null int64
brand           261 non-null object
dtypes: int64(7), object(1)
memory usage: 16.4+ KB


In [14]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)


array([ 0.71428571,  0.88888889,  0.84615385,  0.73076923,  0.73076923,
        0.80769231,  0.80769231,  0.96153846,  0.8       ,  0.8       ])

In [15]:
import time
from datetime import timedelta

start_time = time.time()

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)



elapsed_time_secs = time.time() - start_time

msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))

print(msg)    



Execution took: 0:00:00 secs (Wall clock time)


In [16]:
from datetime import datetime
start_time = datetime.now()

# do your work here
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=20)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00.276200


In [17]:
from datetime import datetime
start_time = datetime.now()

# do your work here
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00.145725


In [18]:
#getting rid of some columns
to_go_columns = ['weight_lbs',
 'year']
data.drop(to_go_columns, inplace=True, axis=1)

In [19]:
data.shape

(261, 6)

In [20]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)


array([ 0.82142857,  0.85185185,  0.73076923,  0.69230769,  0.80769231,
        0.76923077,  0.84615385,  0.80769231,  0.92      ,  0.8       ])

In [21]:
#obtaining new runtime with 2 columns gone
from datetime import datetime
start_time = datetime.now()

# do your work here
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = data.drop('brand', 1)
Y = data['brand']
X = pd.get_dummies(X)
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00.145665
