#### Data Preprocessing

##### Step 1: Importing libraries

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

##### Step 2: Importing dataset

In [12]:
data = pd.read_csv("Datasets/Data.csv")
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


##### Step 3: Checking basic statistical info

In [13]:
data.shape

(10, 4)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [15]:
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


##### Step 4: Handling missing values

In [16]:
data["Age"] = data["Age"].fillna(data["Age"].mean())
data["Salary"] = data["Salary"].fillna(data["Salary"].mean())
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


##### Step 5: Encoding categorical variables

In [18]:
data["Country"].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [19]:
data["Purchased"].unique()

array(['No', 'Yes'], dtype=object)

In [20]:
data["Country"].replace(['France','Spain','Germany'], [0,1,2], inplace=True)
data["Purchased"].replace(['No', 'Yes'], [0,1], inplace=True)
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,1,27.0,48000.0,1
2,2,30.0,54000.0,0
3,1,38.0,61000.0,0
4,2,40.0,63777.777778,1


##### Step 6: Splitting the dataset into training set and testing set

In [33]:
x_train, x_test, y_train, y_test = train_test_split(data, data["Purchased"], test_size = 0.2, random_state=42)

##### Step 7: Feature scaling/Normalizing the features.

In [34]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(x_train)
X_test = sc_X.fit_transform(x_test)

##### Step 8: Model building

In [35]:
logreg= LogisticRegression(multi_class='multinomial', solver="newton-cg",   C=1e10)
logreg.fit(x_train,y_train)
logreg.score(x_test, y_test)



1.0