In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
import warnings
# Seed NumPy's global random generator once, from a single named constant,
# so random draws made through ``np.random`` repeat across runs.
seeds = 42
np.random.seed(seeds)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


# <span class="mark">Correlation Diagram</span>
<img src = 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/d4/Correlation_examples2.svg/600px-Correlation_examples2.svg.png' > 

# <span class="mark">Pearson Correlation Formula</span> : 
<img src='https://wikimedia.org/api/rest_v1/media/math/render/svg/f76ccfa7c2ed7f5b085115086107bbe25d329cec'>

In [21]:
def load_sample_data(boston,iris,normal,**kwargs):
    """Load one of the sample datasets and return it as ``(X, y)``.

    The flags are checked in the order ``boston``, ``iris``, ``normal``; the
    first truthy flag decides which dataset is loaded.

    Attributes:
        boston = True; Boston dataset with MEDV as the target and the other
            columns as the input dataset.
        iris = True; Iris dataset with Class as the target and the other
            columns as the input dataset.
        normal = True; synthetic regression dataset built with
            ``make_regression``.
        Kwargs = n_samples, n_features, n_informative (only used when
            ``normal`` is selected; looked up by name, each one optional).

    Defaults:
        n_samples=100, n_features=100, n_informative=10

    Returns:
        X: DataFrame of input features.
        y: target; a Series named ``MEDV`` for Boston, otherwise a
            single-column DataFrame named ``Class``.

    Raises:
        ValueError: if none of the three flags is truthy.
        TypeError: if ``normal`` is selected and an unknown keyword is passed.
    """
    if boston:
        # NOTE: load_boston was removed in scikit-learn 1.2, so this branch
        # needs an older scikit-learn release to run.
        bunch = load_boston()
        df = pd.DataFrame(bunch.data, columns = bunch.feature_names)
        df["MEDV"] = bunch.target
        # Keyword form: the positional ``axis`` argument is gone in pandas 2.0.
        X = df.drop(columns="MEDV")
        y = df["MEDV"]

    elif iris:
        # Use a separate local name so the ``iris`` flag is not shadowed.
        bunch = load_iris()
        X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        y = pd.DataFrame(bunch.target, columns=['Class'])

    elif normal:
        defaults = {'n_samples': 100, 'n_features': 100, 'n_informative': 10}
        unknown = sorted(set(kwargs) - set(defaults))
        if unknown:
            raise TypeError("Unexpected keyword argument(s): " + ", ".join(unknown))
        # Read each setting by name so the result never depends on the order
        # in which the caller happened to pass the keywords.
        settings = {name: kwargs.get(name, default) for name, default in defaults.items()}
        X, y = make_regression(**settings)
        X = pd.DataFrame(X)
        y = pd.DataFrame(y,columns=['Class'])

    else:
        # Previously this path printed a message and then failed with an
        # UnboundLocalError on the return statement.
        raise ValueError("No dataset Loaded: set one of boston, iris or normal to True")
    return X,y

def cal_pearson_coef(X,show_map,save_map):
    """Compute the absolute Pearson correlation matrix of ``X``.

    Attributes:
        X = dataframe whose columns are correlated pairwise.
        show_map = plot the pearson correlation heatmap via matplotlib and sns.
        save_map = save the pearson correlation heat map in png format
            (``Pearson_Correlation_Heatmap.png`` in the working directory).

    Returns:
        DataFrame of absolute correlation coefficients (``X.corr().abs()``).

    Side effects:
        Always writes the matrix to ``Pearson_Correlation.csv`` (without the
        index column) in the working directory.
    """
    pears_cor = X.corr().abs()
    # Only build the annotated heatmap when someone will actually see or keep
    # it; the figure is pure overhead otherwise.
    if show_map or save_map:
        fig, ax = plt.subplots(figsize=(12,10))
        try:
            sns.heatmap(pears_cor, annot=True, cmap=plt.cm.Reds, ax=ax)
            if save_map:
                fig.savefig('Pearson_Correlation_Heatmap.png')
            if show_map:
                plt.show()
        finally:
            # Release the figure even if saving or showing fails.
            plt.close(fig)
    pears_cor.to_csv('Pearson_Correlation.csv',index=False)
    return pears_cor

def drop_corr_cols(X, THRESH=0.85,show_map=True,save_map=True):
    """Drop features that are highly correlated with an earlier feature.

    Attributes:
        X = Dataset of features (without labels)
        THRESH = Threshold to neglect dependent features and make less complex model.
        show_map = plot the pearson correlation heatmap via matplotlib and sns.
        save_map = save the pearson correlation heat map in png format.

        Defaults:
            THRESH = 0.85
            show_map = True
            save_map = True

    Returns:
        A new DataFrame without the dropped columns. A column is dropped when
        its absolute correlation with any column placed before it in the
        correlation matrix is greater than ``THRESH``.
    """
    corr_matrix = cal_pearson_coef(X,show_map,save_map)
    # Keep only the strict upper triangle so every pair is examined once and
    # the diagonal (self-correlation of 1.0) is ignored. The builtin ``bool``
    # replaces ``np.bool``, which was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # NaN entries (the masked lower triangle) compare False and never trigger.
    to_drop = [column for column in upper.columns if (upper[column] > THRESH).any()]
    print("Following columns are dropped:",to_drop)
    # The first column has no entries above the diagonal, so it can never be
    # marked; at least one feature therefore always survives and no special
    # case for "everything dropped" is needed.
    return X.drop(columns = to_drop)

def drop_multiple_col(col_names_list, df): 
    """Remove several columns from ``df`` by name.

    The frame is modified in place and the same object is returned.
    Attributes: List of column names, df
    """
    df.drop(columns=col_names_list, inplace=True)
    return df

def change_dtypes(col_int, col_float, df): 
    """Downcast the listed columns to 32-bit dtypes to save memory.

    Columns in ``col_int`` become ``int32`` first, then columns in
    ``col_float`` become ``float32``. ``df`` is updated and returned.
    Attributes: List of column names (int, float), df
    """
    for target_dtype, names in (('int32', col_int), ('float32', col_float)):
        for name in names:
            df[name] = df[name].astype(target_dtype)
    return df

def check_missing_data(df):
    """Report missing values per column.

    Returns 0 when ``df`` has no missing values at all; otherwise returns the
    per-column missing-value counts sorted in descending order.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    return 0 if null_counts.sum() == 0 else null_counts
    
def remove_col_str(df,str_cols):
    """Strip unwanted fragments from the given string columns.

    For every column named in ``str_cols``:
      * newline characters are removed, and
      * everything from the first `` &#`` (space, ampersand, hash) to the end
        of the value is removed.

    ``df`` is updated and returned.
    """
    for c in str_cols:
        # Assign the result back to the frame instead of calling
        # ``df[c].replace(..., inplace=True)``: that chained in-place form
        # acts on a temporary Series and no longer updates ``df`` once pandas
        # Copy-on-Write is active.
        without_newlines = df[c].replace('\n', '', regex=True)
        df[c] = without_newlines.replace(' &#.*', '', regex=True)
    return df

def remove_col_white_space(df,col):
    """Strip leading whitespace from every column named in ``col``.

    Only the left side of each value is stripped (``str.lstrip``); trailing
    whitespace is left as is. ``df`` is updated and returned.
    """
    for column_name in col:
        left_stripped = df[column_name].str.lstrip()
        df[column_name] = left_stripped
    return df

def convert_str_datetime(t): 
    """Parse ``t`` with ``pd.to_datetime`` using the fixed layout
    ``%Y-%m-%d %H:%M:%S.%f`` and return the result."""
    timestamp_layout = '%Y-%m-%d %H:%M:%S.%f'
    parsed = pd.to_datetime(t, format=timestamp_layout, dayfirst=True)
    return parsed
    
def clean_dataframe(df,**kwargs):
    """cleaning dataframe : 
    df : Dataframe to be cleaned.
    ne_cols : Primary Columns which can't be empty/NA.
    date_cols: Date columns to be formatted to particular format.
    str_cols: String columns
    drop_cols: Useless Columns 
    col_int: Integer Columns
    col_float: Float Columns

    Every list is optional; a missing or ``None`` entry is treated as empty.
    A column named in ``drop_cols`` is removed and skipped by all other
    steps, even if it also appears in one of the other lists.

    Steps, in order: clean string columns, drop ``drop_cols``, drop rows with
    NA in ``ne_cols``, strip leading whitespace from string columns, parse
    date columns, print the missing-data report, cast int/float columns.

    Returns the cleaned DataFrame. The column drop is done in place by
    ``drop_multiple_col``, so the frame passed in loses those columns too.
    """
    drop_cols = list(kwargs.get('drop_cols') or [])
    dropped = set(drop_cols)

    def _surviving(key):
        # Columns that are about to be dropped must not be touched by later
        # steps, otherwise those steps fail with KeyError on a missing column.
        return [name for name in (kwargs.get(key) or []) if name not in dropped]

    ne_cols = _surviving('ne_cols')
    date_cols = _surviving('date_cols')
    str_cols = _surviving('str_cols')
    col_int = _surviving('col_int')
    col_float = _surviving('col_float')

    if str_cols:
        df = remove_col_str(df,str_cols)
    if drop_cols:
        df = drop_multiple_col(drop_cols,df)
    if ne_cols:
        df = df.dropna(subset=ne_cols)
    if str_cols:
        df = remove_col_white_space(df,str_cols)
    for dt in date_cols:
        df[dt] = df[dt].apply(convert_str_datetime)
    print("Missing Data:",check_missing_data(df))
    if col_int or col_float:
        df = change_dtypes(col_int, col_float, df)
    return df

def specify_cols(df):
    """Interactively classify the columns of ``df``.

    First pass: for each column the user enters one flag (case-insensitive):
    I (int), S (string), F (float), D (date) or N (must not be null). Any
    other answer prints 'Wrong choice' and leaves the column unclassified.

    Second pass: for each column the user answers yes/y to mark it for
    dropping or no/n to keep it. Any other answer prints 'Wrong choice' and
    keeps the column.

    Returns the six lists
    ``(ne_cols, date_cols, str_cols, col_int, col_float, drop_cols)``.
    """
    print("Specify String,Not null,Integer,Float and Date Columns with following flags: \n 1. I: for int \n 2. S: for string \n 3. F: float \n 4. D: Date \n 5.N: Columns should not be Null.")
    ne_cols, date_cols, str_cols = [], [], []
    col_int, col_float, drop_cols = [], [], []
    # Flag letter -> list that collects the columns given that flag.
    bucket_for_flag = {
        'i': col_int,
        's': str_cols,
        'f': col_float,
        'd': date_cols,
        'n': ne_cols,
    }
    for c in df.columns:
        print(f'For column {c} : ')
        flag = input('Input Column Type: ').lower()
        bucket = bucket_for_flag.get(flag)
        if bucket is None:
            print('Wrong choice')
        else:
            bucket.append(c)
    print("Specify Columns to be dropped: \n 1. Yes: To drop column from Dataset \n 2. No: To keep column in dataset")
    for c in df.columns:
        answer = input(f'Do you want to drop {c}: Yes/No: ').lower()
        if answer in ('yes', 'y'):
            drop_cols.append(c)
        elif answer not in ('no', 'n'):
            print('Wrong choice')
    return ne_cols,date_cols,str_cols,col_int,col_float,drop_cols

In [31]:
def _parse_yes_no(answer):
    """Map a yes/y or no/n answer (any case) to True/False; None if unrecognised."""
    normalized = answer.lower()
    if normalized in ('yes', 'y'):
        return True
    if normalized in ('no', 'n'):
        return False
    return None


def main():
    """ Choose any one default load_data or load new file (.csv or .xlsx):
        1. Create new Regression dataset
        2. Iris
        3. Boston

    Interactive workflow: pick or load a dataset, optionally split off a
    target column and clean a loaded file, then drop features whose absolute
    Pearson correlation exceeds a user-supplied threshold and print the
    surviving columns.

    An unrecognised answer prints a message and returns without raising, so
    the notebook kernel keeps running.

    Raises:
        FileNotFoundError: if the entered file path does not exist.
    """
    use_default = _parse_yes_no(str(input('Do you want default Dataset? : Yes/No - ')))
    if use_default is True:
        print("------ Default Dataset -----")
        print("Chose any one load_data options : \n 1. Create new Regression dataset \n 2. Iris \n 3. Boston")
        try:
            val = int(input('Please enter choice: '))
        except ValueError:
            val = None
        # The target returned by load_sample_data is not needed for
        # correlation-based feature rejection, so it is discarded.
        if val == 1:
            X, _ = load_sample_data(boston=False,iris=False,normal=True,n_samples=100, n_features=10, n_informative=5)
        elif val == 2:
            X, _ = load_sample_data(boston=False,iris=True,normal=False)
        elif val == 3:
            X, _ = load_sample_data(boston=True,iris=False,normal=False)
        else:
            print("No such option available.")
            return

    elif use_default is False:
        print("------ Non Default Dataset -----")
        f_name = str(input(r'Please enter filename with directory without start and end quotes: For eg : C:\Users\ABC\Downloads\Kaggle\Data_0.013_20200807_040358.csv')).strip()
        # Explicit check instead of ``assert``, which is skipped under ``python -O``.
        if not os.path.exists(f_name):
            raise FileNotFoundError("File doesn't exist at - "+f_name+" or Directory doesn't exist. Please check!!")
        # Compare the extension case-insensitively so e.g. ".CSV" is accepted.
        _ext = os.path.splitext(f_name)[1].lower()
        if _ext in ('.csv', '.txt'):
            df = pd.read_csv(f_name)
        elif _ext == '.xlsx':
            sname = str(input('Please enter sheet_name: '))
            # Read the named sheet when one is given; previously that result
            # was always overwritten by a second read of the default sheet.
            if sname:
                df = pd.read_excel(f_name,sheet_name = sname)
            else:
                df = pd.read_excel(f_name)
        else:
            print("File has unidentified extension.")
            return
        print('Data Loaded Successfully! \n ----------------------------------------- \n')
        has_labels = _parse_yes_no(input('Does you data consist labels? Yes/No'))
        if has_labels is None:
            print("No such option available.")
            return
        if has_labels:
            # Remove the target so it takes no part in the feature correlation.
            target_col = input('Please provide Target Column in the given file - ')
            X = df.drop(columns= [target_col]).reset_index(drop=True)
        else:
            X = df
        ne_cols,date_cols,str_cols,col_int,col_float,drop_cols = specify_cols(X)
        X = clean_dataframe(X,ne_cols=ne_cols,date_cols=date_cols,str_cols=str_cols,
                            drop_cols=drop_cols,col_int=col_int,col_float=col_float)
    else:
        print("No such option available.")
        return

    THRESH = float(input('Please enter threshold for Pearson Coeffecient to reject the correalated variables: '))
    save_answer = input('Do you want to save correlation graphs? Yes/No- ')
    show_answer = input('Do you want to show correlation graphs? Yes/No- ')
    show_flag = _parse_yes_no(show_answer)
    save_flag = _parse_yes_no(save_answer)
    if show_flag is None or save_flag is None:
        print("No such option available.")
        return
    print("Earlier Variables were: ",X.columns)
    new_df = drop_corr_cols(X,THRESH,show_flag,save_flag)
    print('Selected Variables after rejecting via pearson correlation are :',new_df.columns)

In [32]:
# Example input path, as entered at the filename prompt in the recorded run:
# C:\Users\AW639XJ\Downloads\Nachi's Interview code\Trial_Dataset.csv
# Start the interactive workflow only when this code is executed directly.
if __name__=='__main__':
    main()

Do you want default Dataset? : Yes/No - n
------ Non Default Dataset -----
Please enter filename with directory without start and end quotes: For eg : C:\Users\ABC\Downloads\Kaggle\Data_0.013_20200807_040358.csvC:\Users\AW639XJ\Downloads\Nachi's Interview code\Trial_Dataset.csv
Data Loaded Successfully! 
 ----------------------------------------- 

Does you data consist labels? Yes/Noy
Please provide Target Column in the given file - Class
Specify String,Not null,Integer,Float and Date Columns with following flags: 
 1. I: for int 
 2. S: for string 
 3. F: float 
 4. D: Date 
 5.N: Columns should not be Null.
For column sepal length (cm) : 
Input Column Type: f
For column sepal width (cm) : 
Input Column Type: f
For column petal length (cm) : 
Input Column Type: f
For column petal width (cm) : 
Input Column Type: f
Specify Columns to be dropped: 
 1. Yes: To drop column from Dataset 
 2. No: To keep column in dataset
Do you want to drop sepal length (cm): Yes/No: n
Do you want to drop

# <span class="burk">Read Doc</span>: 
1. https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html
2. https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
3. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
4. https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html
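5. https://scikit-learn.org/1.1/modules/generated/sklearn.datasets.load_boston.html (versioned link: `load_boston` was removed in scikit-learn 1.2, so it is no longer in the stable docs)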