In [10]:
import pandas as pd
import numpy as np
cars = pd.read_csv("auto.csv")
print(cars.head())
unique_regions = cars["origin"].unique()
print(unique_regions)

    mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0  18.0          8         307.0       130.0  3504.0          12.0    70   
1  15.0          8         350.0       165.0  3693.0          11.5    70   
2  18.0          8         318.0       150.0  3436.0          11.0    70   
3  16.0          8         304.0       150.0  3433.0          12.0    70   
4  17.0          8         302.0       140.0  3449.0          10.5    70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  
[1 3 2]


In [11]:
dummy_cylinders = pd.get_dummies(cars['cylinders'], prefix='cyl')
cars = pd.concat([cars, dummy_cylinders], axis=1)
dummy_year = pd.get_dummies(cars['year'], prefix='year')
cars = pd.concat([cars, dummy_year], axis=1)
cars = cars.drop(['year', 'cylinders'], axis=1)
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]
highest_train_row = int(cars.shape[0] * .70)
train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

## One VS All: multiple binary classification

In [16]:
from sklearn.linear_model import LogisticRegression
unique_origins = cars['origin'].unique()
features = [c for c in cars.columns if c.startswith('cyl') or c.startswith('year')]

models = {}
for origin in unique_origins:
    X = cars[features]
    y = (cars['origin'] == origin)
    model = LogisticRegression()
    model.fit(X, y)
    models[origin] = model

In [18]:
testing_probs = pd.DataFrame(columns=unique_origins)

Unnamed: 0,1,3,2


In [40]:
# Testing the model
for origin in unique_origins:
    X_test = test[features]
    testing_probs[origin] = models[origin].predict_proba(X_test)[:,1]

In [44]:
testing_probs.head()

Unnamed: 0,1,3,2
0,0.964759,0.019735,0.029907
1,0.987199,0.013007,0.014411
2,0.203386,0.484871,0.321464
3,0.275093,0.267376,0.455397
4,0.563179,0.32649,0.121482


In [48]:
# df.idxmax() : return a Series where each value corresponds 
# to the column or where the maximum value occurs for that observation.

predicted_origins = testing_probs.idxmax(axis=1)
predicted_origins

0      1
1      1
2      3
3      2
4      1
5      1
6      3
7      1
8      1
9      3
10     1
11     3
12     3
13     1
14     2
15     1
16     3
17     1
18     1
19     2
20     3
21     1
22     1
23     2
24     1
25     2
26     2
27     1
28     3
29     1
      ..
88     2
89     1
90     1
91     1
92     1
93     1
94     1
95     1
96     3
97     1
98     1
99     1
100    3
101    1
102    1
103    1
104    2
105    3
106    3
107    1
108    2
109    1
110    1
111    1
112    1
113    1
114    3
115    1
116    1
117    1
Length: 118, dtype: int64