In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the dataset from auto.csv in the working directory.
cars = pd.read_csv("auto.csv")

# Distinct origin codes. unique() reports values in order of first
# appearance rather than sorted, so the printed order need not be ascending.
unique_regions = pd.unique(cars["origin"])

print(unique_regions)

[1 3 2]


In [3]:
# cylinders and year are treated as categorical here, so each one is expanded
# into one indicator (dummy) column per distinct value. origin is also
# categorical but is left untouched in this cell.
drop_columns = ["cylinders", "year"]

# Indicator columns are named <prefix>_<value>, e.g. cyl_4 or year_76.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["year"], prefix="year")

# Swap the two source columns for their indicator columns in one step:
# the remaining original columns come first, then cyl_*, then year_*.
cars = pd.concat(
    [cars.drop(columns=drop_columns), dummy_cylinders, dummy_years],
    axis=1,
)

# Verify the changes
print(cars.columns)

Index(['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin',
       'cyl_3', 'cyl_4', 'cyl_5', 'cyl_6', 'cyl_8', 'year_70', 'year_71',
       'year_72', 'year_73', 'year_74', 'year_75', 'year_76', 'year_77',
       'year_78', 'year_79', 'year_80', 'year_81', 'year_82'],
      dtype='object')


In [4]:
# Shuffle the rows before splitting so the train/test sets are not biased by
# whatever order the rows have in the file.
SPLIT_SEED = 42        # fixed seed so Restart & Run All reproduces the same split
TRAIN_FRACTION = 0.7   # share of rows used for training; the rest is for testing

rng = np.random.default_rng(SPLIT_SEED)

# Permute row *positions* (0..n-1), not index labels: .iloc is positional, so
# feeding it labels is only correct while cars has a default 0..n-1 index.
shuffled_rows = rng.permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

# Number of rows that go to the training set (truncated towards zero).
tRows = int(len(shuffled_cars) * TRAIN_FRACTION)

# The first tRows shuffled rows are used for training ...
train = shuffled_cars.iloc[:tRows].copy()
# ... and the remaining rows for testing. Copies keep later edits to either
# split from touching shuffled_cars.
test = shuffled_cars.iloc[tRows:].copy()

# Sanity check: the two splits together cover every row exactly once.
assert len(train) + len(test) == len(cars)

In [5]:
# One-vs-rest setup: train one binary logistic regression per distinct value
# of the origin column ("is this row from origin k, or not?").

# Distinct origin values, sorted in ascending order.
unique_origins = np.sort(cars["origin"].unique())

# Predictors are the indicator columns created from cylinders and year,
# recognised by their "cyl" / "year" name prefixes.
features = [col for col in train.columns if col.startswith(("cyl", "year"))]

# The predictor matrix is the same for every model, so build it once.
X_train = train[features]

# Map each origin value to its fitted model. The target for a given origin is
# a boolean Series: True where the training row has that origin.
# fit() returns the estimator itself, so it can be stored directly.
models = {
    label: LogisticRegression().fit(X_train, train["origin"] == label)
    for label in unique_origins
}

In [8]:
# Start from an empty frame with one column per origin and show its layout.
testing_probs = pd.DataFrame(columns=unique_origins)

print(testing_probs)

# Same feature columns as used for training, taken from the test split.
X_test = test[features]

# For each origin, column 1 of predict_proba is the probability of the
# positive class, i.e. that the row belongs to that origin.
positive_probs = {
    label: models[label].predict_proba(X_test)[:, 1]
    for label in unique_origins
}

# One column per origin. Rows follow the order of `test` but are labelled
# 0..n-1 because the probabilities are plain arrays without test's index.
testing_probs = pd.DataFrame(positive_probs, columns=unique_origins)

print(testing_probs)

Empty DataFrame
Columns: [1, 2, 3]
Index: []
            1         2         3
0    0.151510  0.339128  0.522704
1    0.273058  0.382966  0.332317
2    0.239566  0.447108  0.315868
3    0.965861  0.024765  0.023684
4    0.358199  0.277067  0.341899
..        ...       ...       ...
113  0.963996  0.030184  0.018097
114  0.912769  0.058121  0.049025
115  0.957127  0.025087  0.029470
116  0.949300  0.032440  0.027396
117  0.238843  0.324585  0.237084

[118 rows x 3 columns]


In [9]:
# For every test row, pick the origin whose one-vs-rest model gave the
# highest probability (the label of the row-wise maximum column).
predicted_origins = testing_probs.idxmax(axis="columns")
print(predicted_origins)

0      3
1      2
2      2
3      1
4      1
      ..
113    1
114    1
115    1
116    1
117    2
Length: 118, dtype: int64
