# Libaries

In [1]:
import csv

import numpy as np
import os.path
import pandas as pd
import logging
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Logger

In [2]:
logging.basicConfig(filename="info.log",
                        format="%(asctime)s-%(message)s", filemode='a+')
log = logging.getLogger()
log.setLevel(logging.DEBUG)

# Function

In [3]:
def read_input(file_loc, sheet_name=""):
    '''
        Function description: To read different types of files such as csv, xlsx,txt

                Parameters:
                        file_loc(Str): A decimal integer
                        sheet_name(Str): Consist of sheet name for xlsx files ("" - default value)

                Returns:
                        pandas data frame
        '''

    name, extension = os.path.splitext(file_loc)
    if extension == ".csv":
        data = pd.read_csv(file_loc)
    elif extension == ".xlsx":
        data = pd.read_excel(file_loc)
    elif extension == ".txt":
        data = pd.read_table(file_loc)
    else:
        print("file type not supported")

    # print(data.head())
    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("Data has been read file type: %s", extension)
    return data

In [4]:
def clean_data(data, dep_var, var_to_remove):

    """
        This function cleans and arranges the data; removes completely empty columns and columns with 0 variance.

        Parameters:
        data (df): input data to be cleaned

        dep_var (str): name of the dependent variable

        var_to_remove (list): variable(s) which are to be removed from the dataset

        Returns:
        cleaned dataset.

    """

    dep_var_series = data[dep_var]
    to_drop = [dep_var] + var_to_remove
    data.drop(to_drop, inplace=True, axis=1)

    data = pd.concat([dep_var_series, data], axis=1)
    nan_value = float("NaN")
    data.replace("", nan_value, inplace=True)

    data.dropna(how='all', axis=1, inplace=True)

    data_var = data.var(axis=0)

    for (columnName, columnData) in data_var.iteritems():
        if columnData == 0:
            data.drop(columnName, inplace=True, axis=1)

    log.info("dataset is cleansed.")

    return data



In [5]:
def remove_duplicate(data):
    '''
        Function description: To remove duplicate rows in the dataset.

                Parameters:
                        data: pandas data frame after cleaning

                Returns:
                        pandas data frame
        '''

    data = data.drop_duplicates(keep='first', inplace=False)

    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("Duplicate rows have been removed")

    return data

In [6]:
def fill_rate(data):

    """
        This function calculates the fill rate of columns.

        Parameters:
        df (df): input dataset with a set of columns

        Returns:
        doesn't return anything, saves the fill rate computed csv file in the output folder.

    """

    data = data.notna().sum() / len(data) * 100

    output_dir = f"../outputs"
    isdir = os.path.isdir(output_dir)
    if not isdir:
        os.makedirs(output_dir)

    fill_rate_dir = f"../outputs/fill_rate"
    isdir = os.path.isdir(fill_rate_dir)
    if not isdir:
        os.makedirs(fill_rate_dir)

    data.to_csv('./Output/fill_rate.csv', header = COL_NAME, index=False)

    log.info("fill percentages of columns is calculated and stored.")


In [7]:
def split_train_test(df, split_ratio):
    '''
    Function description: to split the data set to train and test sets based on the split ratio and combining them
    back into a dataframe

                    Parameters:
                            df: pandas dataframe for the dataset
                            split_ratio(Float): Consists of the ratio for splitting train and tes(80:20)

                    Returns:
                            pandas data frame containing the train and test dataset separately.
            '''

    train, test = train_test_split(df, train_size=split_ratio, random_state=0)

    #df = pd.concat([test.assign(ind="test"), train.assign(ind="train")])

    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("Data has been split into train and test, ratio of split: %f", split_ratio)
    return train,test



In [8]:
def save_uid(train, test):

    """
        Maps the unique ids of data points.

        Parameters:
        train (df): input the train split dataset

        test (df): input the test split dataset

        Returns:
        doesn't return anything, saves the train and test computed csv file in the output folder.

    """

    train_uid_list = []
    test_uid_list = []

    for i, row in train.iterrows():
        train_uid_list.append(i)

    train_data = {'Sno': [i for i in range(1, len(train.index)+1)],
                  'train_uid': train_uid_list}

    train_uid = pd.DataFrame(train_data)

    for i, row in test.iterrows():
        test_uid_list.append(i)

    test_data = {'Sno': [i for i in range(1, len(test.index) + 1)],
                 'test_uid': test_uid_list}

    test_uid = pd.DataFrame(test_data)

    output_dir = f"./Output"
    isdir = os.path.isdir(output_dir)
    if not isdir:
        os.makedirs(output_dir)

    fill_rate_dir = f"./Output/"
    isdir = os.path.isdir(fill_rate_dir)
    if not isdir:
        os.makedirs(fill_rate_dir)

    train_uid.to_csv('./Output/Train_id_mapping.csv', index=False)
    test_uid.to_csv('./Output/Test_id_mapping.csv', index=False)

    log.info("train and test id mapping csv files have been generated")

In [9]:
def missing_value_imputation(data):
    '''
        Function description: to fill the missing numerical and categorical variables with median and mode respectively.

                        Parameters:
                                data(pandas dataframe): Train dataset

                        Returns:
                                pandas data frame containing the updated train dataset and dictionary containing the median and mode values for each column.
                '''

    col = list(data)
    dict ={}
    for i in col:
        if data.dtypes[i] != np.object:
            x=data[i].median()
            data.fillna(x,inplace=True)
            dict[i]=x
        else:
            x=data[i].mode()
            #print(x[0])
            data[i].fillna(x[0],inplace=True)
            dict[i]=x[0]
    #print(dict)
    #print(data)
    missing_imputation = open(".\Output\missing_value_imputation.csv", "w")
    writer = csv.DictWriter(missing_imputation,fieldnames=["Col", "Val"])
    writer.writeheader()
    writer = csv.writer(missing_imputation)
    for key, value in dict.items():
        writer.writerow([key, value])
    missing_imputation.close()

    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("missing values of train data has been filled with median and mode and written to file")
    return data,dict


In [10]:
def test_missing_value_imputation(data,dict):
    '''
            Function description: to fill the missing numerical and categorical variables with median and mode respectively.

                            Parameters:
                                    data(pandas dataframe): Test dataset
                                    dict: Dictionary which consist of median and mode values for the respective columns.

                            Returns:
                                    pandas data frame containing the updated test dataset.
                    '''

    col = list(data)
    for i in col:
            x=dict[i]
            data.fillna(x,inplace=True)


    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("missing values of test data has been filled with median and mode ")
    return data


In [11]:
def standardize(data, target_var):

    """
        This function performs standardization on the set of external variable.

        Parameters:
        data (df): input dataset including the target variable in first column.

        target_var (str):  input the name of the target variable.

        Returns:
        a data set with standardized variables.

    """

    compute_stats_dict = {'type': ['mean', 'std']}

    for (col, col_data) in data.iteritems():
        if not data.dtypes[col] == 'object':
            compute_stats_dict[col] = [col_data.mean(), col_data.std()]
    compute_stats = pd.DataFrame(compute_stats_dict)
    # compute_stats.drop(['HotUnicode', 'index'], inplace=True, axis=1)

    compute_stats_dir = f"./Output"
    isdir = os.path.isdir(compute_stats_dir)
    if not isdir:
        os.makedirs(compute_stats_dir)

    compute_stats.to_csv('./Output/compute_stats.csv', index=False)

    for col, col_data in data.iteritems():
        if not col == target_var:
            data[col] = stats.zscore(data[col])

    log.info("train dataset's variables are standardized")

    return data

In [12]:
def test_standardize(data, target_var):

    """
        This function performs standardization on the set of external variable.

        Parameters:
        data (df): input test dataset including the target variable in the first column.

        target_var (str):  input the name of the target variable.

        Returns:
        a data set with standardized variables.

    """
    try:
        data.drop('index', axis=1, inplace=True)
    except KeyError:
        pass

    get_stats = pd.read_csv('./Output/compute_stats.csv')

    for (col, col_data) in data.iteritems():
        if not col == target_var:
            try:
                mean = get_stats[col][0]
                std_dev = get_stats[col][1]
            except KeyError:
                pass
            else:
                data[col] = (data[col] - mean) / std_dev

    log.info("test dataset's variables are standardized")

    return data

In [13]:
def train_one_hot_encoding(data):
    '''
        Function description: to perform one hot encoding for multiple categorical variables.

                        Parameters:
                                data(pandas dataframe): Train dataset

                        Returns:
                                pandas data frame containing the updated Train dataset with one hot encoded values.
                '''

    col = list(data)
    #dict = {}
    for i in col:
        if data.dtypes[i] == np.object:
             one_hot_encoded_data = pd.get_dummies(data, columns=[i])
             print(one_hot_encoded_data)

    one_hot_encoding = open(".\Output\Train_one_hot_encoding.csv", "w")

    one_hot_encoded_data.to_csv(".\Output\Train_one_hot_encoding.csv", index=False)
    one_hot_encoding.close()

    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("One hot encoding has been performed for train data")
    return one_hot_encoded_data



In [14]:
def test_one_hot_encoding(data):
    '''
        Function description: to perform one hot encoding for multiple categorical variables.

                        Parameters:
                                data(pandas dataframe): Test dataset

                        Returns:
                                pandas data frame containing the updated Test dataset with one hot encoded values.
                '''

    col = list(data)
    #dict = {}
    for i in col:
        if data.dtypes[i] == np.object:
             one_hot_encoded_data = pd.get_dummies(data, columns=[i])
             print(one_hot_encoded_data)

    one_hot_encoding = open(".\Output\Test_one_hot_encoding.csv", "w")

    one_hot_encoded_data.to_csv(".\Output\Test_one_hot_encoding.csv", index=False)
    one_hot_encoding.close()

    #logging.basicConfig(filename="info.log",format="%(asctime)s-%(message)s", filemode='a+')
    #log = logging.getLogger()
    #log.setLevel(logging.DEBUG)
    log.info("One hot encoding has been performed for test data")
    return one_hot_encoded_data


In [15]:
def col_rename(train, target_var):

    """
        This function is used to rename the column names of a data frame by removing special characters and spaces thus making it usable in multiple models.

        Parameters:
        train (df): input train dataset including the target variable in first column.

        target_var (str): input the name of the target variable.

        Returns:
        The function returns the data set with renamed columns

    """
    print(train)
    original_cols = []
    #train.drop('index', axis=1, inplace=True)

    train.rename({target_var: target_var.replace("_", ".")}, axis=1, inplace=True)

    for (col, col_data) in train.iteritems():
        original_cols.append(col)

    renamed_cols = [f"x{n}" for n in range(1, len(original_cols)+1)]
    train.set_axis(renamed_cols, axis=1, inplace=True)

    col_rename_dict = {'Original_cols': original_cols,
                       'Renamed_cols': renamed_cols}

    column_rename_dir = f"./Output"
    isdir = os.path.isdir(column_rename_dir)
    if not isdir:
        os.makedirs(column_rename_dir)

    col_rename_df = pd.DataFrame(col_rename_dict)
    col_rename_df.to_csv('./Output/column_mapping.csv', index=False)

    print("train dataset's columns are renamed")

    return train


In [16]:
def test_col_rename(test, target_actual):

    """
       This function is used to rename the column names of a data frame by removing special characters and spaces thus making it usable in multiple models.

       Parameters:
       train (df): input test dataset including the target variable in first column.

       target_actual(int): input 0(false) or 1(true) to decide whether to rename the target variable or not.

       Returns:
       The function returns the data set with renamed columns

    """

    try:
        test.drop('index', axis=1, inplace=True)
    except KeyError:
        pass

    get_col_mapping = pd.read_csv("./Output/column_mapping.csv")
    rename_test_cols = get_col_mapping['Renamed_cols'].tolist()

    if target_actual == 0:
        rename_test_cols[0] = get_col_mapping['Original_cols'].tolist()[0]

    test.set_axis(rename_test_cols, axis=1, inplace=True)

    print("test dataset's columns are renamed")

    return test


# Driver

In [17]:
path = r"C:\Users\Kshitij Sharma\PycharmProjects\Kedro\code\get-started\data\01_raw\iris.csv"
data = read_input(path)

dep_var = 'species'
del_var = []
data=clean_data(data, dep_var, del_var)

data=remove_duplicate(data)

COL_NAME = ['fill_rate']
fill_rate(data)

train,test = split_train_test(data, 0.8)

save_uid(train, test)

sub_val={}
train,sub_val = missing_value_imputation(train)
test = test_missing_value_imputation(test,sub_val)

train=standardize(train, 'species')
test=test_standardize(test, 'species')

train = train_one_hot_encoding(train)
test = test_one_hot_encoding(test)

col_rename(test, "Hot_Unicode")
test_col_rename(train, 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


     sepal_length  sepal_width  petal_length  petal_width  species_setosa  \
34      -0.817574     2.061700     -1.298147    -1.446127               1   
81      -0.207753    -0.994589     -0.164730    -0.288789               0   
19      -0.939539     0.839184     -1.354818    -1.188941               1   
130      0.645997    -0.587084      1.025358     1.125734               0   
66      -0.329717    -0.383331     -0.108059     0.096990               0   
48      -0.939539     1.450442     -1.241476    -1.317534               1   
2       -1.183467    -0.179579     -1.354818    -1.317534               1   
88       1.011890     0.024174      0.515321     0.354176               0   
11      -1.183467     0.024174     -1.298147    -1.446127               1   
128      0.402069    -0.587084      0.571991     0.739955               0   
112      0.767962     0.227927      0.742004     0.997141               0   
65       0.280104    -0.383331      0.515321     0.225583               0   

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
34,-0.817574,2.061700,-1.298147,-1.446127,1,0,0
81,-0.207753,-0.994589,-0.164730,-0.288789,0,1,0
19,-0.939539,0.839184,-1.354818,-1.188941,1,0,0
130,0.645997,-0.587084,1.025358,1.125734,0,0,1
66,-0.329717,-0.383331,-0.108059,0.096990,0,1,0
48,-0.939539,1.450442,-1.241476,-1.317534,1,0,0
2,-1.183467,-0.179579,-1.354818,-1.317534,1,0,0
88,1.011890,0.024174,0.515321,0.354176,0,1,0
11,-1.183467,0.024174,-1.298147,-1.446127,1,0,0
128,0.402069,-0.587084,0.571991,0.739955,0,0,1
