# Multiclass classification

## Cleaning data

In [1]:
import pandas as pd
import numpy as np

In [2]:
column_name =['mpg','cylinders','displacement','horsepower','weight','acceleration','year','origin','car_name']
cars = pd.read_table('auto-mpg.data', header=None,delim_whitespace=True)
cars.columns=column_name
print cars.head(1)

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   

   origin                   car_name  
0       1  chevrolet chevelle malibu  


### Convert categorical variable into indicator variables

In [3]:
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)

dummy_years = pd.get_dummies(cars['year'], prefix='year')
cars = pd.concat([cars,dummy_years], axis=1)
cars = cars.drop("year", axis=1)
cars = cars.drop("cylinders", axis=1)
print(cars.head(5))

cars.dtypes

    mpg  displacement horsepower  weight  acceleration  origin  \
0  18.0         307.0      130.0  3504.0          12.0       1   
1  15.0         350.0      165.0  3693.0          11.5       1   
2  18.0         318.0      150.0  3436.0          11.0       1   
3  16.0         304.0      150.0  3433.0          12.0       1   
4  17.0         302.0      140.0  3449.0          10.5       1   

                    car_name  cyl_3  cyl_4  cyl_5   ...     year_73  year_74  \
0  chevrolet chevelle malibu    0.0    0.0    0.0   ...         0.0      0.0   
1          buick skylark 320    0.0    0.0    0.0   ...         0.0      0.0   
2         plymouth satellite    0.0    0.0    0.0   ...         0.0      0.0   
3              amc rebel sst    0.0    0.0    0.0   ...         0.0      0.0   
4                ford torino    0.0    0.0    0.0   ...         0.0      0.0   

   year_75  year_76  year_77  year_78  year_79  year_80  year_81  year_82  
0      0.0      0.0      0.0      0.0      0.0

mpg             float64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
origin            int64
car_name         object
cyl_3           float64
cyl_4           float64
cyl_5           float64
cyl_6           float64
cyl_8           float64
year_70         float64
year_71         float64
year_72         float64
year_73         float64
year_74         float64
year_75         float64
year_76         float64
year_77         float64
year_78         float64
year_79         float64
year_80         float64
year_81         float64
year_82         float64
dtype: object

In [4]:
cars = cars[cars['horsepower'] != '?']
cars['horsepower'] = cars['horsepower'].astype(float)
cars.dtypes

mpg             float64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
origin            int64
car_name         object
cyl_3           float64
cyl_4           float64
cyl_5           float64
cyl_6           float64
cyl_8           float64
year_70         float64
year_71         float64
year_72         float64
year_73         float64
year_74         float64
year_75         float64
year_76         float64
year_77         float64
year_78         float64
year_79         float64
year_80         float64
year_81         float64
year_82         float64
dtype: object

### Split data into test/train

In [5]:
# shuffle rows to randomly split into test/train
cars = cars.reset_index()

shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]
shuffled_cars = shuffled_cars.drop('car_name', axis=1)
split=int(len(shuffled_rows)*.7)

train = shuffled_cars.iloc[:split]
test = shuffled_cars.iloc[split:]

train.head(3)

Unnamed: 0,index,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
139,141,29.0,98.0,83.0,2219.0,16.5,2,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,97,18.0,225.0,105.0,3121.0,16.5,1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,326,43.4,90.0,48.0,2335.0,23.7,2,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Multiclass Logistic Regression
### Training for each 'class'

In [9]:
# import and instantiate
from sklearn.linear_model import LogisticRegression

# get the unique tokens for origin
unique_origins = cars['origin'].unique()
unique_origins.sort()
print(unique_origins)

models = {}
cols = [c for c in train.columns if c.startswith("cyl") or c.startswith("year")]

# for every unique values in origin, fit a logistic regression for multiclass classification
for value in unique_origins:
    lr = LogisticRegression()
    lr.fit(train[cols],(train['origin']==value))
    models[value] = lr

[1 2 3]


<font color='red'>
**it is very important to instantiate the model EACH time for EACH class, fit the model and STORE that model**

### Testing for each class

In [10]:
# create empty dataframe
testing_probs = pd.DataFrame(columns=unique_origins)  

for origin in unique_origins:
    X_test = test[cols]
    # Compute probability of observation being in the origin.
    testing_probs[origin] = models[origin].predict_proba(X_test)[:,1]

print(testing_probs.head(3))

          1         2         3
0  0.846078  0.118386  0.049195
1  0.979858  0.020204  0.021376
2  0.442736  0.309040  0.235080


### Predict origin

In [11]:
predicted_origins = testing_probs.idxmax(axis=1)

print(predicted_origins)
print(predicted_origins.value_counts())

0      1
1      1
2      1
3      1
4      2
5      2
6      1
7      1
8      1
9      1
10     2
11     2
12     1
13     1
14     1
15     3
16     1
17     1
18     1
19     2
20     2
21     2
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     3
      ..
88     1
89     3
90     1
91     1
92     1
93     3
94     1
95     2
96     2
97     1
98     1
99     1
100    1
101    2
102    3
103    2
104    2
105    1
106    1
107    1
108    2
109    1
110    1
111    3
112    1
113    3
114    1
115    1
116    1
117    2
dtype: int64
1    74
2    35
3     9
dtype: int64
