# import the libraries

In [1]:

import numpy as py

import matplotlib.pyplot as plt

import pandas as pd 

# Import the dataset

In [2]:
df = pd.read_csv('DataSet.csv')


# Exploring Data

In [3]:
df


Unnamed: 0,Developer,Age,Salary,Married
0,Karachi,42.0,78000.0,No
1,Lahore,32.0,48000.0,Yes
2,Karachi,36.0,60000.0,No
3,Multan,41.0,68000.0,No
4,Lahore,42.0,,Yes
5,Multan,43.0,59000.0,Yes
6,Karachi,,59000.0,No
7,Lahore,44.0,79000.0,Yes
8,Multan,52.0,99000.0,No
9,Lahore,32.0,69000.0,Yes


In [4]:
df.shape

(10, 4)

In [5]:
df.columns

Index(['Developer', 'Age', 'Salary', 'Married'], dtype='object')

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Developer  10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Married    10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


# Data Cleaning

Checking Null Values

In [37]:
df.isnull().any()

Developer    False
Age          False
Salary       False
Married      False
dtype: bool

In [8]:
print(df.isnull().sum())

Developer    0
Age          1
Salary       1
Married      0
dtype: int64


In [9]:
print('\n Missing Data percentage(%):')
print(df.isnull().sum()/df.count()*100)


 Missing Data percentage(%):
Developer     0.000000
Age          11.111111
Salary       11.111111
Married       0.000000
dtype: float64


Filling Null Values(Mean/Median/Most_Frequent)

In [10]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])

In [11]:
df['Age']

0    42.0
1    32.0
2    36.0
3    41.0
4    42.0
5    43.0
6    42.0
7    44.0
8    52.0
9    32.0
Name: Age, dtype: float64

In [12]:
df['Salary']

0    78000.0
1    48000.0
2    60000.0
3    68000.0
4    68000.0
5    59000.0
6    59000.0
7    79000.0
8    99000.0
9    69000.0
Name: Salary, dtype: float64

In [13]:
df.isnull().any()

Developer    False
Age          False
Salary       False
Married      False
dtype: bool

# Separate dependent and independent feature¶

In [14]:
y = df['Married']
x = df.drop('Married', axis=1)

In [15]:
x.shape

(10, 3)

In [16]:
y.shape

(10,)

# Separating categorical & Numerical columns

In [17]:
df_cat=x.select_dtypes("object")
df_num=x.select_dtypes("number")

# Encoding of categorical data

Label Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

#labelencoder = LabelEncoder()

#for col in df_cat.columns:
 #   df_cat[col] = labelencoder.fit_transform(df_cat[col])


In [19]:
#df_cat

Dummies Method

In [20]:
df_cat_encoded=pd.get_dummies(df_cat,columns=df_cat.columns.to_list(), dtype=int)
df_cat_encoded.head()

Unnamed: 0,Developer_Karachi,Developer_Lahore,Developer_Multan
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0


In [22]:
# Concatenating encoded categorical data with numerical data
x = pd.concat([df_cat_encoded, df_num], axis=1, join='outer')
x.head()

Unnamed: 0,Developer_Karachi,Developer_Lahore,Developer_Multan,Age,Salary
0,1,0,0,42.0,78000.0
1,0,1,0,32.0,48000.0
2,1,0,0,36.0,60000.0
3,0,0,1,41.0,68000.0
4,0,1,0,42.0,68000.0


In [23]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Developer_Karachi  10 non-null     int32  
 1   Developer_Lahore   10 non-null     int32  
 2   Developer_Multan   10 non-null     int32  
 3   Age                10 non-null     float64
 4   Salary             10 non-null     float64
dtypes: float64(2), int32(3)
memory usage: 412.0 bytes


In [24]:
x.shape

(10, 5)

# Training and Testing Data (divide the data into two part)

In [25]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test =train_test_split(x,y,test_size=0.2, random_state=0)

In [26]:
x_train.shape

(8, 5)

In [27]:
y_train.shape

(8,)

In [28]:
x_test.shape

(2, 5)

In [29]:
y_test.shape

(2,)

# Standard and fit the data for better predication


In [30]:
from sklearn.preprocessing import StandardScaler

sc_x=StandardScaler()

x_train=sc_x.fit_transform(x_train)
x_test=sc_x.fit_transform(x_test)




In [34]:
x_train

array([[-0.57735027,  1.        , -0.57735027,  0.49468474,  0.20628425],
       [-0.57735027,  1.        , -0.57735027, -1.7039141 ,  0.30942637],
       [-0.57735027,  1.        , -0.57735027, -1.7039141 , -1.85655824],
       [ 1.73205081, -1.        , -0.57735027,  0.49468474, -0.72199487],
       [-0.57735027,  1.        , -0.57735027,  0.93440451,  1.34084762],
       [-0.57735027, -1.        ,  1.73205081,  0.27482485,  0.20628425],
       [ 1.73205081, -1.        , -0.57735027,  0.49468474,  1.2377055 ],
       [-0.57735027, -1.        ,  1.73205081,  0.71454462, -0.72199487]])

In [35]:
x_test

array([[ 1.,  0., -1., -1., -1.],
       [-1.,  0.,  1.,  1.,  1.]])