In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns

#for model creation and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

**Step 2: Importing dataset**

In [49]:
df=pd.read_csv('/content/Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [50]:
df.shape #10 rows,4 columns

(10, 4)

**Step 3: Handling the missing data**

In [51]:
def rename_cols(dataframe):  # to change to lower case
  cols=[var.lower() for var in dataframe.columns]
  dataframe.columns = cols
  return dataframe 

In [52]:
df = rename_cols(df)
df

Unnamed: 0,country,age,salary,purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    10 non-null     object 
 1   age        9 non-null      float64
 2   salary     9 non-null      float64
 3   purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [54]:
# checking null values
df.isnull().sum()

country      0
age          1
salary       1
purchased    0
dtype: int64

In [55]:
#Age has one nan value -- continous -- applying mean
#imputing mean to age null value
df=df.fillna({'age':df['age'].mean()})

In [56]:
df=df.fillna({'salary':df['salary'].mean()})

In [57]:
df.isnull().sum()

country      0
age          0
salary       0
purchased    0
dtype: int64

In [58]:
df.dtypes # checking datatypes it is correct

country       object
age          float64
salary       float64
purchased     object
dtype: object

In [59]:
#checing duplicates column by column for categorical columns
df['country'].value_counts()

France     4
Spain      3
Germany    3
Name: country, dtype: int64

In [60]:
df['purchased'].value_counts()

No     5
Yes    5
Name: purchased, dtype: int64

In [61]:
df=df.drop_duplicates() # if any rows are duplicated wil be removed

In [62]:
#now cleaned data
df

Unnamed: 0,country,age,salary,purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [63]:
#Country - Categorial - Nominal data 
#One hot encoding
df = pd.get_dummies(df,columns = ['country'])

In [64]:
df

Unnamed: 0,age,salary,purchased,country_France,country_Germany,country_Spain
0,44.0,72000.0,No,1,0,0
1,27.0,48000.0,Yes,0,0,1
2,30.0,54000.0,No,0,1,0
3,38.0,61000.0,No,0,0,1
4,40.0,63777.777778,Yes,0,1,0
5,35.0,58000.0,Yes,1,0,0
6,38.777778,52000.0,No,0,0,1
7,48.0,79000.0,Yes,1,0,0
8,50.0,83000.0,No,0,1,0
9,37.0,67000.0,Yes,1,0,0


In [65]:
#Purchased - categorical - Binary - Label encoding
df['purchased'] = df['purchased'].map({'No':0,'Yes':1})

In [66]:
df

Unnamed: 0,age,salary,purchased,country_France,country_Germany,country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [67]:
df.shape  # After encoding the dataset has 10 rows and 6 columns

(10, 6)

**Step 5: Creating a dummy variable**

In [68]:
df

Unnamed: 0,age,salary,purchased,country_France,country_Germany,country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the datasets into training sets and Test sets**

In [75]:
# Assuming salary to be predicted
X=df.drop('salary',axis=1) # expect salary , features are age,country,purchased
y = df['salary']  # target is salary

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4) #80% train , 20% test datas

In [76]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(6, 5) (4, 5) (6,) (4,)


**Step 7: Feature Scaling**

In [78]:
from sklearn.preprocessing import StandardScaler #import
scaler = StandardScaler() #initialize
scaler.fit(X_train)  #fit
X_train_scaled = scaler.transform(X_train) #transform
X_test_scaled  = scaler.transform(X_test) 

In [79]:
print(X_train_scaled)

[[-0.21952852  1.          1.41421356 -1.         -0.4472136 ]
 [-0.5488213   1.          1.41421356 -1.         -0.4472136 ]
 [ 0.27441065  1.         -0.70710678  1.         -0.4472136 ]
 [-1.37205325 -1.         -0.70710678  1.         -0.4472136 ]
 [-0.05488213 -1.         -0.70710678 -1.          2.23606798]
 [ 1.92087455 -1.         -0.70710678  1.         -0.4472136 ]]


In [80]:
print(X_test_scaled)

[[ 0.07317617 -1.         -0.70710678 -1.          2.23606798]
 [ 1.59158177  1.          1.41421356 -1.         -0.4472136 ]
 [-1.86599242  1.         -0.70710678 -1.          2.23606798]
 [ 0.93299621 -1.          1.41421356 -1.         -0.4472136 ]]
