In [1]:
import pandas as pd
import matplotlib.pyplot as plt

columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
cars = pd.read_table('auto-mpg.txt',delim_whitespace=True,names = columns)
print(cars.head(5))
print(cars.tail(5))
cars = cars.dropna()

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0        8.0         307.0       130.0  3504.0          12.0   
1  15.0        8.0         350.0       165.0  3693.0          11.5   
2  18.0        8.0         318.0       150.0  3436.0          11.0   
3  16.0        8.0         304.0       150.0  3433.0          12.0   
4  17.0        8.0         302.0       140.0  3449.0          10.5   

   model year  origin                   car name  
0        70.0     1.0  chevrolet chevelle malibu  
1        70.0     1.0          buick skylark 320  
2        70.0     1.0         plymouth satellite  
3        70.0     1.0              amc rebel sst  
4        70.0     1.0                ford torino  
      mpg  cylinders  displacement  horsepower  weight  acceleration  \
401  27.0        4.0         140.0        86.0  2790.0          15.6   
402  44.0        4.0          97.0        52.0  2130.0          24.6   
403  32.0        4.0         135.0        84.0  2295.0   

In [2]:
dummy_cylinders = pd.get_dummies(cars['cylinders'],prefix = 'cy1')
#print(dummy_cylinders)
cars = pd.concat([cars,dummy_cylinders],axis = 1)
dummy_years = pd.get_dummies(cars['model year'],prefix = 'year')
#print(dummy_years)
cars = pd.concat([cars,dummy_years],axis=1)
cars = cars.drop('model year',axis = 1)
cars = cars.drop('cylinders',axis = 1)
print(cars.head())

    mpg  displacement  horsepower  weight  acceleration  origin  \
0  18.0         307.0       130.0  3504.0          12.0     1.0   
1  15.0         350.0       165.0  3693.0          11.5     1.0   
2  18.0         318.0       150.0  3436.0          11.0     1.0   
3  16.0         304.0       150.0  3433.0          12.0     1.0   
4  17.0         302.0       140.0  3449.0          10.5     1.0   

                    car name  cy1_3.0  cy1_4.0  cy1_5.0    ...      year_73.0  \
0  chevrolet chevelle malibu        0        0        0    ...              0   
1          buick skylark 320        0        0        0    ...              0   
2         plymouth satellite        0        0        0    ...              0   
3              amc rebel sst        0        0        0    ...              0   
4                ford torino        0        0        0    ...              0   

   year_74.0  year_75.0  year_76.0  year_77.0  year_78.0  year_79.0  \
0          0          0          0     

In [8]:
import numpy as np
shuffled_rows = np.random.permutation(list(range(0,len(cars.index))))
shuffled_cars = cars.iloc[shuffled_rows]
highest_train_row = int(cars.shape[0]*.70)
train = shuffled_cars.iloc[0:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

In [17]:
from sklearn.linear_model import LogisticRegression

unique_origins = cars['origin'].unique()
unique_origins.sort()

models = {}
features = [c for c in train.columns if c.startswith('cyl') or c.startswith('year')]

for origin in unique_origins:
    model = LogisticRegression()
    
    X_train = train[features]
    y_train = train['origin'] == origin
    
    model.fit(X_train,y_train)
    models[origin] = model

In [20]:
testing_probs = pd.DataFrame(columns = unique_origins)
print(testing_probs)
for origin in unique_origins:
    X_test = test[features]
    testing_probs[origin] = models[origin].predict_proba(X_test)[:,1]
print(testing_probs)

Empty DataFrame
Columns: [1.0, 2.0, 3.0]
Index: []
          1.0       2.0       3.0
0    0.632606  0.120836  0.255325
1    0.477532  0.129127  0.398020
2    0.724654  0.202293  0.091880
3    0.724654  0.202293  0.091880
4    0.724654  0.202293  0.091880
5    0.643507  0.241043  0.124743
6    0.607933  0.184327  0.209753
7    0.632606  0.120836  0.255325
8    0.648082  0.290948  0.077710
9    0.607933  0.184327  0.209753
10   0.643507  0.241043  0.124743
11   0.594467  0.259994  0.149368
12   0.764236  0.164830  0.091880
13   0.670086  0.170295  0.166338
14   0.607933  0.184327  0.209753
15   0.648082  0.290948  0.077710
16   0.632606  0.120836  0.255325
17   0.594467  0.259994  0.149368
18   0.724654  0.202293  0.091880
19   0.764236  0.164830  0.091880
20   0.648082  0.290948  0.077710
21   0.705731  0.156983  0.151366
22   0.764236  0.164830  0.091880
23   0.594467  0.259994  0.149368
24   0.477532  0.129127  0.398020
25   0.594467  0.259994  0.149368
26   0.705731  0.156983  0.1513