In [366]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

In [368]:
cars =  pd.read_csv('auto.csv')

In [370]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [372]:
unique_regions = cars['origin'].unique()
unique_regions.sort()

In [374]:
print(unique_regions)

[1 2 3]


In [376]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 24.6 KB


In [378]:
dummy_cylinders = pd.get_dummies(cars['cylinders'], prefix='cyl')

In [380]:
cars = pd.concat([cars, dummy_cylinders], axis=1)

In [382]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,False,False,False,False,True
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,False,False,False,False,True
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,False,False,False,False,True
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,False,False,False,False,True
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,False,False,False,False,True


In [384]:
dummy_years = pd.get_dummies(cars['year'], prefix='year')
cars = pd.concat([cars, dummy_years], axis=1)

In [386]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,cyl_3,cyl_4,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,False,False,...,False,False,False,False,False,False,False,False,False,False


In [388]:
cars_dummied = cars.drop(['cylinders', 'year'], axis=1)

In [390]:
shuffled_rows = np.random.permutation(cars_dummied.index)
shuffled_cars = cars_dummied.iloc[shuffled_rows]

In [392]:
train = shuffled_cars[:int(len(shuffled_cars)*0.7)]
test = shuffled_cars[int(len(shuffled_cars)*0.7):]

In [394]:
from sklearn.linear_model import LogisticRegression

models = {}
features = [c for c in train.columns if c.startswith('cyl') or c.startswith('year')]

for region in unique_regions:
    model = LogisticRegression(solver='liblinear')
    x_train = train[features]
    y_train = train['origin'] == region

    model.fit(x_train, y_train)
    models[region] = model

In [396]:
testing_probs = pd.DataFrame(columns=unique_regions)

In [398]:
for region in unique_regions:
    x_test = test[features]
    predictions = models[region].predict_proba(x_test)[:,1]
    testing_probs[region] = predictions

testing_probs

Unnamed: 0,1,2,3
0,0.959134,0.040470,0.021356
1,0.832395,0.082563,0.104225
2,0.284430,0.252879,0.463115
3,0.797773,0.128896,0.079037
4,0.954038,0.045157,0.019532
...,...,...,...
113,0.953148,0.017648,0.055728
114,0.284430,0.252879,0.463115
115,0.346430,0.298102,0.324515
116,0.245242,0.263948,0.496549


In [400]:
predicted_origins = testing_probs.idxmax(axis=1)

In [404]:
test['predicted_origin'] = predicted_origins.to_list()

In [406]:
test.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82,predicted_origin
164,13.0,302.0,129.0,3169.0,12.0,1,False,False,False,False,...,False,True,False,False,False,False,False,False,False,1
223,17.5,250.0,110.0,3520.0,16.4,1,False,False,False,True,...,False,False,False,True,False,False,False,False,False,1
345,34.1,91.0,68.0,1985.0,16.0,3,False,True,False,False,...,False,False,False,False,False,False,False,True,False,3
151,18.0,250.0,105.0,3459.0,16.0,1,False,False,False,True,...,False,True,False,False,False,False,False,False,False,1
7,14.0,440.0,215.0,4312.0,8.5,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [410]:
test['good_prediction'] = test['origin'] == test['predicted_origin']

In [412]:
test.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82,predicted_origin,good_prediction
164,13.0,302.0,129.0,3169.0,12.0,1,False,False,False,False,...,True,False,False,False,False,False,False,False,1,True
223,17.5,250.0,110.0,3520.0,16.4,1,False,False,False,True,...,False,False,True,False,False,False,False,False,1,True
345,34.1,91.0,68.0,1985.0,16.0,3,False,True,False,False,...,False,False,False,False,False,False,True,False,3,True
151,18.0,250.0,105.0,3459.0,16.0,1,False,False,False,True,...,True,False,False,False,False,False,False,False,1,True
7,14.0,440.0,215.0,4312.0,8.5,1,False,False,False,False,...,False,False,False,False,False,False,False,False,1,True


In [414]:
tpr = len(test['good_prediction'] == True) / len(test)

In [418]:
print(tpr)

1.0
