In [1]:
import pandas as pd
import numpy as np

# class data preprocessing
class Data_preprocessing:
    
    # fuction for display csv file
    def display_data(self):
        
        # Exception handling for file 
        try:
            # for user input file name
            self.file = input("\n Enter file name:-")
            
            # read csv file
            self.df = pd.read_csv(self.file)
            # print("\n Read file",df.head(15))
            print(self.df)
        
        # if file not found the error
        except OSError as e:
            # print exception
            print("File not found")
   

    """Handling missing data"""
    def handling_missing_data(self):
        
        # check data type of all variable
        print("\n",self.df.dtypes)
        
        # check for null value
        print("\n\n",self.df.isnull().sum())

        # replacing missing values with mean

        print("\n\n",self.df['Age'].replace(np.NaN,self.df['Age'].mean(), inplace = True))
        print("\n\n",self.df['Salary'].replace(np.NaN,self.df['Age'].mean(), inplace = True))
        # print file
        print(self.df)
        
        
    """Handling categorical data"""
    
    def handling_categorical_data(self):
        
        # create dummy variable
        Country_dummies = pd.get_dummies(self.df['Country'],prefix='Country')
        
        # concate with dataframe
        my_file = pd.concat([self.df,Country_dummies], axis =1)
        print(my_file)
        
        # create dummy Variable for Purchased
        self.df['Purchased_dummies'] = self.df.Purchased.map({ 'Yes':1,'No':0 })
        print(self.df)

        
    """Split the dataset into training set and test set"""
    
    def split_dataset_into_training_test(self):
        
        train_per = int(0.70*len(self.df))
        test_per = len(self.df)-train_per
        X_train_set = self.df.head(train_per)
        Y_train_set = self.df.tail(test_per)
        
        # display data
        print("\n\n Training set")
        print(X_train_set)
        print("\n\n Test data set")
        print(Y_train_set)
           
    """Feature scaling"""  
    def feature_scaling(self):
        # Simple feature scaling
        self.df["Age"] = self.df["Age"]/self.df["Age"].max()
        self.df["Salary"]= self.df["Salary"]/self.df["Salary"].max()
        print("Simple feature scaling")
        print(self.df)
        
        # Min-Max
        self.df["Age"] = (self.df["Age"]-self.df["Age"].min())/(self.df["Age"].max()-self.df["Age"].min())
        self.df["Salary"] = (self.df["Salary"]-self.df["Salary"].min())/(self.df["Salary"].max()-self.df["Salary"].min())
        print("Min-max")
        print(self.df)
        
        # Z-Score
        self.df["Age"]= (self.df["Age"]-self.df["Age"].mean())/self.df["Age"].std()
        self.df["Salary"]= (self.df["Salary"]-self.df["Salary"].mean())/self.df["Salary"].std()
        print("Z-Score")
        print(self.df)
        
        

object_class = Data_preprocessing()


In [2]:
object_class.display_data()




 Enter file name:-data_preprocessing.csv
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [3]:

object_class.handling_missing_data()



 Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object


 Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


 None


 None
   Country        Age        Salary Purchased
0   France  44.000000  72000.000000        No
1    Spain  27.000000  48000.000000       Yes
2  Germany  30.000000  54000.000000        No
3    Spain  38.000000  61000.000000        No
4  Germany  40.000000     38.777778       Yes
5   France  35.000000  58000.000000       Yes
6    Spain  38.777778  52000.000000        No
7   France  48.000000  79000.000000       Yes
8  Germany  50.000000  83000.000000        No
9   France  37.000000  67000.000000       Yes


In [4]:

object_class.handling_categorical_data()

   Country        Age        Salary Purchased  Country_France  \
0   France  44.000000  72000.000000        No               1   
1    Spain  27.000000  48000.000000       Yes               0   
2  Germany  30.000000  54000.000000        No               0   
3    Spain  38.000000  61000.000000        No               0   
4  Germany  40.000000     38.777778       Yes               0   
5   France  35.000000  58000.000000       Yes               1   
6    Spain  38.777778  52000.000000        No               0   
7   France  48.000000  79000.000000       Yes               1   
8  Germany  50.000000  83000.000000        No               0   
9   France  37.000000  67000.000000       Yes               1   

   Country_Germany  Country_Spain  
0                0              0  
1                0              1  
2                1              0  
3                0              1  
4                1              0  
5                0              0  
6                0              

In [5]:

object_class.split_dataset_into_training_test()



 Training set
   Country        Age        Salary Purchased  Purchased_dummies
0   France  44.000000  72000.000000        No                  0
1    Spain  27.000000  48000.000000       Yes                  1
2  Germany  30.000000  54000.000000        No                  0
3    Spain  38.000000  61000.000000        No                  0
4  Germany  40.000000     38.777778       Yes                  1
5   France  35.000000  58000.000000       Yes                  1
6    Spain  38.777778  52000.000000        No                  0


 Test data set
   Country   Age   Salary Purchased  Purchased_dummies
7   France  48.0  79000.0       Yes                  1
8  Germany  50.0  83000.0        No                  0
9   France  37.0  67000.0       Yes                  1


In [6]:
object_class.feature_scaling()

Simple feature scaling
   Country       Age    Salary Purchased  Purchased_dummies
0   France  0.880000  0.867470        No                  0
1    Spain  0.540000  0.578313       Yes                  1
2  Germany  0.600000  0.650602        No                  0
3    Spain  0.760000  0.734940        No                  0
4  Germany  0.800000  0.000467       Yes                  1
5   France  0.700000  0.698795       Yes                  1
6    Spain  0.775556  0.626506        No                  0
7   France  0.960000  0.951807       Yes                  1
8  Germany  1.000000  1.000000        No                  0
9   France  0.740000  0.807229       Yes                  1
Min-max
   Country       Age    Salary Purchased  Purchased_dummies
0   France  0.739130  0.867408        No                  0
1    Spain  0.000000  0.578116       Yes                  1
2  Germany  0.130435  0.650439        No                  0
3    Spain  0.478261  0.734816        No                  0
4  German