## PROCESS FLOW CLASSES 

# In [1]:
# Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn import preprocessing
from scipy import stats
import scipy.stats as ttest_ind
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
import warnings 
warnings.filterwarnings("ignore")

# In [7]:
# Loading Data
# Read the CSV file "hmelq.csv" (path is relative to the current working
# directory) into the module-level DataFrame `data`.
data=pd.read_csv("hmelq.csv")
class Loading_Data:
    """Wrap raw tabular input and expose a DataFrame preview of it."""

    def __init__(self, data):
        # Anything the ``pd.DataFrame`` constructor accepts.
        self.data = data

    def translate_to_dataframe(self):
        """Convert ``self.data`` to a DataFrame and return its first five rows."""
        frame = pd.DataFrame(self.data)
        preview = frame.head()
        return preview
# Data Information
class Information:
    """Print-oriented summaries (structure, missing values, statistics) of a DataFrame."""

    def __init__(self, data):
        self.data = data

    def info_data(self):
        """Print the info report, dtypes, shape and column index of the data."""
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        self.data.info()
        print(self.data.dtypes)
        print(self.data.shape)
        print(self.data.columns)

    def describe_missing_values(self):
        """Print whether any value is missing, then the missing count per column."""
        missing_mask = self.data.isnull()
        print(missing_mask.values.any())  # True if at least one value is missing
        print(missing_mask.sum())  # number of missing observations per variable

    def select_dtypes_numeric(self):
        """Return the float64/int64 columns of the data as a new DataFrame."""
        return self.data.select_dtypes(include=['float64', 'int64'])

    def describe_data(self):
        """Print the transposed ``describe()`` summary of the data.

        The previous version also built a numeric-only frame that was never
        used; it has been removed without changing the printed output.
        """
        print(self.data.describe().T)

    def select_dtypes_category(self):
        """Print the object-typed columns, then the value counts of each one."""
        df_category = self.data.select_dtypes(include=["object"])
        print(df_category)
        for column in df_category.columns:
            # Frequency of every category of this variable.
            print(self.data[column].value_counts())

# Exploratory Data Analysis (EDA)
class Visualizer:
    """Plotting helpers built on matplotlib, seaborn and missingno.

    Every method reads from ``self.data``; none of them depends on a
    module-level ``data`` variable.
    """

    def __init__(self, data):
        self.data = data

    def _numeric_frame(self):
        """Return only the float64/int64 columns of ``self.data``."""
        return self.data.select_dtypes(include=['float64', 'int64'])

    def _cross_table(self, x, y, n):
        """Return the plain (unstyled) cross-tabulation of columns *x* and *y*."""
        # pd.crosstab uses normalize=False (not None) to mean "raw counts".
        normalize = False if n is None else n
        return pd.crosstab(self.data[x], self.data[y], normalize=normalize)

    def msno_bar(self):
        """Draw missingno's bar chart for the data and return msno.bar's result."""
        plt.figure(figsize=(6, 5))
        return msno.bar(self.data, color='lightblue')

    def bar_plot(self, x=None, y=None, z=None):
        """Draw a seaborn bar plot of *y* against *x*, coloured by *z*."""
        plt.figure(figsize=(6, 5))
        sns.barplot(x=x, y=y, hue=z, data=self.data)

    def box_plot(self, x=None, y=None, z=None):
        """Draw one box plot per non-object column (x, y and z are unused)."""
        non_object_columns = [
            name for name in self.data.columns
            if self.data[name].dtype != "object"
        ]
        for name in non_object_columns:
            plt.figure(figsize=(6, 5))
            plt.title(name)
            sns.boxplot(data=self.data[name])

    def hist_plot(self):
        """Draw a 100-bin histogram for every float64/int64 column."""
        df_numeric = self._numeric_frame()
        for name in df_numeric.columns:
            plt.figure()
            # Missing values are dropped per column: NaNs make the automatic
            # bin-range detection fail.
            plt.hist(df_numeric[name].dropna(), bins=100, color="orange")
            plt.title(f"Histogram of {name}")

    def dist_plot(self, x=None, y=None, z=None):
        """Draw a kernel density estimate for every float64/int64 column.

        Missing values are removed per column, so a gap in one variable no
        longer discards that row from every other variable's plot.
        (x, y and z are unused.)
        """
        df_numeric = self._numeric_frame()
        for name in df_numeric.columns:
            values = df_numeric[name].dropna()
            plt.figure()
            # kdeplot replaces the deprecated distplot(hist=False, kde=True).
            sns.kdeplot(data=values.to_numpy(), color="g")
            plt.title(f"Distplot  of {name}")

    def reg_plot(self):
        """Plot sorted values of each numeric column against sorted normal draws.

        The first float64/int64 column is skipped, as before. The normal
        sample uses the column's own mean and (population) standard deviation.
        """
        df_numeric = self._numeric_frame()
        columns = df_numeric.columns[1:]
        # Keep the original 2x5 layout, but add rows when there are more than
        # ten columns instead of failing on an out-of-range subplot index.
        n_rows = max(2, -(-len(columns) // 5))
        plt.figure(figsize=(16, 7))
        for position, column in enumerate(columns, 1):
            values = df_numeric[column].dropna()
            if values.empty:
                continue
            plt.subplot(n_rows, 5, position)
            rand_norm = np.random.normal(values.mean(), values.std(ddof=0), len(values))
            # Keyword arguments: newer seaborn treats the first positional
            # argument as ``data`` and rejects a second positional one.
            sns.regplot(x=np.sort(rand_norm), y=np.sort(values.to_numpy()))
            plt.xlabel(f'{column}')

    def count_plot(self, x=None, y=None, z=None):
        """Draw a seaborn count plot of *x* (or *y*), coloured by *z*."""
        plt.figure(figsize=(6, 5))
        sns.countplot(x=x, y=y, hue=z, data=self.data)

    def correlation(self):
        """Show an annotated heatmap of the correlations of the numeric columns."""
        fig, ax = plt.subplots(figsize=(10, 10))
        # Only numeric columns are correlated; calling corr() on a frame that
        # still contains string columns raises in recent pandas versions.
        corr_matrix = self.data.select_dtypes(include="number").corr()
        sns.heatmap(corr_matrix, ax=ax, annot=True,
                    linewidths=0.05, fmt='.2f', cmap="Blues")
        plt.show()

    def scatter_plot(self, x=None, y=None, z=None):
        """Return a seaborn scatter plot of *y* against *x*, coloured by *z*.

        *z* used to be ignored; it is now passed as ``hue`` like in the other
        plotting methods (the default ``None`` keeps the old output).
        """
        return sns.scatterplot(x=x, y=y, hue=z, data=self.data)

    def lm_plot(self, x=None, y=None, z=None, w=None, r=None):
        """Return a seaborn lmplot with hue *z*, column facet *w*, row facet *r*."""
        return sns.lmplot(x=x, y=y, hue=z, col=w, row=r, data=self.data)

    def swarm_plot(self, x=None, y=None, z=None):
        """Return a seaborn swarm plot of *y* against *x*, coloured by *z*."""
        return sns.swarmplot(x=x, y=y, hue=z, data=self.data)

    def line_plot(self, x=None, y=None, z=None):
        """Return a seaborn line plot of *y* against *x*, coloured by *z*."""
        return sns.lineplot(x=x, y=y, hue=z, data=self.data)

    def pair_plot(self, x=None, y=None, z=None, w=None):
        """Return a seaborn pair plot of the data, coloured by *z* (x, y, w unused)."""
        return sns.pairplot(self.data, hue=z)

    def cross_tab(self, x=None, y=None, n=None):
        """Return a styled cross-tabulation of columns *x* and *y*.

        *n* is forwarded to ``pd.crosstab`` as ``normalize``; ``None`` means
        raw counts. The previous implementation always raised (it called
        ``.columns`` on a list) and never used *x* or *y*.
        """
        table = self._cross_table(x, y, n)
        return table.style.background_gradient(cmap="summer_r")
        
# Performing Hypothesis Testing
class HypothesisTesting:
    """Run and report common hypothesis tests on the columns of a DataFrame.

    All tests read from ``self.data`` and compare the p-value with ``ALPHA``.
    Results are printed; the methods return ``None``.
    """

    ALPHA = 0.05  # significance level shared by every test
    _SEPARATOR = "*****************************************************************************************"

    def __init__(self, data):
        self.data = data

    def _numeric_columns(self, skip=None):
        """Return the float64/int64 column names, leaving out *skip* if given."""
        columns = self.data.select_dtypes(include=['float64', 'int64']).columns
        return [name for name in columns if name != skip]

    def normality_assumption(self):
        """Run the Shapiro-Wilk normality test on every float64/int64 column."""
        for i in self._numeric_columns():
            # Missing values are dropped per column before testing.
            stat, p = stats.shapiro(self.data[i].dropna())
            print("Statistics:%3.3f, p=%.3f " % (stat, p))
            if p > self.ALPHA:
                print(i, " için Orneklem Normal (Gaussian) Dagilimdan gelmektedir (Fail to Reject H0)")
            else:
                print(i, " için Orneklem Normal (Gaussian) Dagilimdan gelmemektedir (reject H0)")
        print(self._SEPARATOR)

    def assumption_of_variance_homogeneity(self, variable=None, x=None, y=None):
        """Run Levene's test between groups *x* and *y* of column *variable*.

        The test is run once per float64/int64 column (the grouping column
        itself is skipped). Previously the identical test was repeated once
        per distinct group value and the rows were selected with a mask built
        from the module-level ``data``.
        """
        for i in self._numeric_columns(skip=variable):
            df_new = self.data.dropna(subset=[i])
            # Build the masks from the same (NaN-free) frame they index.
            first_group = df_new.loc[df_new[variable] == x, i]
            second_group = df_new.loc[df_new[variable] == y, i]
            stat, p = stats.levene(first_group, second_group)
            print("Statistics:%3.3f, p=%.3f " % (stat, p))
            if p > self.ALPHA:
                print(i, " için varyans homojendir. (Fail to Reject H0)")
            else:
                print(i, " için varyans homojen degildir. (reject H0)")
        print(self._SEPARATOR)

    def two_independent_samples_t_test(self, variable=None, x=None, y=None):
        """Run Welch's t-test between groups *x* and *y* of column *variable*.

        The test is run for every float64/int64 column except the grouping
        column itself. The conclusion messages were inverted before: a
        p-value above ALPHA means H0 is not rejected, i.e. there is *no*
        significant difference.
        """
        for i in self._numeric_columns(skip=variable):
            df_new = self.data.dropna(subset=[i])
            first_group = df_new[df_new[variable] == x][i]
            second_group = df_new[df_new[variable] == y][i]
            # equal_var=False -> Welch's test (no equal-variance assumption).
            t, p = stats.ttest_ind(first_group, second_group, equal_var=False)
            print("ttest_ind: i=%s t = %g  p = %g" % (i, t, p))
            if p > self.ALPHA:
                print(i, f" ile  {variable} değişkeni arasında istatistiksel olarak anlamlı bir fark yoktur.(Fail to Reject H0)")
            else:
                print(i, f" ile  {variable} değişkeni arasında istatistiksel olarak anlamlı bir fark vardır.(reject H0)")
        print(self._SEPARATOR)

    def chi_square_t_test(self, x=None, y=None):
        """Run the chi-square test of independence between columns *x* and *y*."""
        data_cross_tab = pd.crosstab(index=self.data[x], columns=self.data[y])
        chi2, p, dof, expected = stats.chi2_contingency(data_cross_tab)
        print("Chi-Square Test =%g p=%g" % (chi2, p))
        if p > self.ALPHA:
            print(x, "ve", y, " degiskenleri birbirinden bağımsızdır.(Fail to Reject H0)")
        else:
            print(x, "ve", y, " degiskenleri birbirinden bağımsız değildir(reject H0)")
        print(self._SEPARATOR)
        
# Data Preprocessing
class PreprocessStrategy:
    """Missing-value handling, encoding and scaling helpers for a DataFrame."""

    def __init__(self, data):
        self.data = data

    def _fill_numeric_with(self, statistic):
        """Fill NaNs of each numeric column with that column's *statistic*.

        *statistic* is the name of a Series method ("mean" or "median").
        Non-numeric columns are returned unchanged, because computing a mean
        or median of string data raises in current pandas versions.
        """
        def _fill(column):
            if not pd.api.types.is_numeric_dtype(column):
                return column
            return column.fillna(getattr(column, statistic)())

        return self.data.apply(_fill, axis=0)

    def fill_missing_value(self):
        """Drop every row that has a missing value; return the first five rows left.

        NOTE: despite its name this method removes rows rather than filling
        them, and it returns only a five-row preview.
        """
        return self.data.dropna().head()

    def fill_missing_value_with_mean(self):
        """Return a copy of the data with numeric NaNs replaced by the column mean."""
        return self._fill_numeric_with("mean")

    def fill_missing_value_with_median(self):
        """Return a copy of the data with numeric NaNs replaced by the column median."""
        return self._fill_numeric_with("median")

    def normalization(self):
        """Return the data passed through ``sklearn.preprocessing.normalize``.

        With its default arguments that function rescales each sample (row)
        to unit L2 norm; it does not map each variable to the 0-1 range.
        """
        return preprocessing.normalize(self.data)

    def one_hot_dummy_variable(self, variable=None):
        """One-hot encode column *variable*; return the first five encoded rows."""
        df_one_hot = self.data.copy()
        return pd.get_dummies(df_one_hot, columns=[variable], prefix=[variable]).head()

    def label_encoder(self, new_variable_name=None, categorical_variable_to_converted=None):
        """Label-encode a categorical column into a new column of ``self.data``.

        The encoded values are stored in ``self.data[new_variable_name]`` and
        that column is returned. (Previously the module-level ``data`` was
        modified instead of the instance's own frame.)
        """
        lbe = preprocessing.LabelEncoder()
        self.data[new_variable_name] = lbe.fit_transform(
            self.data[categorical_variable_to_converted]
        )
        return self.data[new_variable_name]

    def standardization(self):
        """Return the data standardized with ``sklearn.preprocessing.scale``."""
        return preprocessing.scale(self.data)

    def min_max_transformation(self, x=None, y=None):
        """Rescale every variable to the range [x, y]; defaults to [0, 1]."""
        # None is not a valid bound for MinMaxScaler, so fall back to 0 and 1.
        lower = 0 if x is None else x
        upper = 1 if y is None else y
        scaler = preprocessing.MinMaxScaler(feature_range=(lower, upper))
        return scaler.fit_transform(self.data)

    def binarize_transformation(self, threshold=None):
        """Map values above *threshold* to 1 and the rest to 0 (default threshold 0.0)."""
        # None is not a valid threshold for Binarizer; 0.0 is its own default.
        cutoff = 0.0 if threshold is None else threshold
        binarizer = preprocessing.Binarizer(threshold=cutoff).fit(self.data)
        return binarizer.transform(self.data)
        
        
# Data Modelling ,Performance/Evaluation metrics of the models
class GridSearchHelper():
    """Split data, fit scikit-learn models, predict and report metrics.

    The most recently fitted estimator is kept in ``self.model`` and the most
    recent predictions in ``self.y_pred``. Prediction and evaluation methods
    use these (or an explicitly passed ``model``) instead of relying on
    notebook-level variables such as ``model``, ``loj_model`` or ``y_pred``.
    """

    def __init__(self, data):
        self.data = data
        self.model = None   # last estimator fitted through this helper
        self.y_pred = None  # last predictions produced by fit_predict

    def _resolve_model(self, model=None):
        """Return *model* if given, else the stored one; raise if neither exists."""
        estimator = self.model if model is None else model
        if estimator is None:
            raise ValueError(
                "No fitted model available: fit one with this helper first "
                "or pass it via the 'model' argument."
            )
        return estimator

    def dataset_split(self, X_train=None, X_test=None, y_train=None, y_test=None,
                      X=None, y=None, test_size=0.20, random_state=42):
        """Split features *X* and target *y* into train and test parts.

        The first four parameters are kept only for backward compatibility;
        their values are ignored. When *X* or *y* is not passed, a
        module-level variable of the same name is used, as the original
        implementation did.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: if no features or target can be found.
        """
        if X is None:
            X = globals().get("X")
        if y is None:
            y = globals().get("y")
        if X is None or y is None:
            raise ValueError("dataset_split needs both X and y.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    def linear_regresyon(self, X_train=None, X_test=None, y_test=None, y_train=None):
        """Fit a LinearRegression on the training data, store and return it."""
        self.model = LinearRegression().fit(X_train, y_train)
        return self.model

    def pca(self, X_train=None, X_test=None, y_test=None, y_train=None):
        """Fit a linear regression on the principal components of ``X_train``.

        Prints the cumulative explained variance (in percent) of the first
        five components. The PCA used to be fitted on ``X_test`` while the
        regression target was ``y_train``, which cannot work because the two
        have different numbers of samples. The fitted PCA is kept in
        ``self.pca_model``.

        NOTE(review): inputs later given to the returned model must be scaled
        and projected the same way as the training data.
        """
        pca = PCA()
        X_reduced_train = pca.fit_transform(scale(X_train))
        self.pca_model = pca
        print(np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)[0:5])
        # Missing targets are replaced by the mean of the training target.
        y_train = y_train.fillna(y_train.mean())
        self.model = LinearRegression().fit(X_reduced_train, y_train)
        return self.model

    def logistic_regresyon(self, X_train=None, X_test=None, y_test=None, y_train=None, solver=None):
        """Fit a LogisticRegression on the training data, store and return it.

        When *solver* is None the scikit-learn default solver is used.
        """
        loj = LogisticRegression() if solver is None else LogisticRegression(solver=solver)
        self.model = loj.fit(X_train, y_train)
        return self.model

    def fit_predict(self, X_test=None, model=None):
        """Predict *X_test*, print the predictions, store and return them."""
        estimator = self._resolve_model(model)
        self.y_pred = estimator.predict(X_test)
        print(self.y_pred)
        return self.y_pred

    def show_evaluation_metrics_and_result_regression_model(self, X_test=None, y_test=None, model=None):
        """Print intercept, coefficients, R2, test RMSE and 10-fold CV RMSE."""
        estimator = self._resolve_model(model)
        y_pred = estimator.predict(X_test)
        print(estimator.intercept_)
        print(estimator.coef_)
        print(r2_score(y_test, y_pred))
        print(np.sqrt(mean_squared_error(y_test, y_pred)))  # test error of the model
        cv_scores = cross_val_score(estimator, X_test, y_test, cv=10,
                                    scoring="neg_mean_squared_error")
        print(np.sqrt(-cv_scores).mean())

    def show_evaluation_metrics_and_result_classification_model(self, X_test=None, y_test=None, model=None):
        """Print accuracy, mean 10-fold CV score and the classification report."""
        # Imported here because the module-level imports do not provide them.
        from sklearn.metrics import accuracy_score, classification_report

        estimator = self._resolve_model(model)
        y_pred = estimator.predict(X_test)
        print(accuracy_score(y_test, y_pred))
        print(cross_val_score(estimator, X_test, y_test, cv=10).mean())
        print(classification_report(y_test, y_pred))

    def roc_curve(self, X_test=None, y_test=None, model=None):
        """Plot the ROC curve of the classifier on the test data."""
        # Aliased so the sklearn function is not confused with this method.
        from sklearn.metrics import roc_auc_score, roc_curve as sk_roc_curve

        estimator = self._resolve_model(model)
        # AUC is computed from the positive-class probabilities, the same
        # scores the curve itself is drawn from (hard labels were used before).
        positive_scores = estimator.predict_proba(X_test)[:, 1]
        logit_roc_auc = roc_auc_score(y_test, positive_scores)
        fpr, tpr, threshold = sk_roc_curve(y_test, positive_scores)
        plt.figure()
        plt.plot(fpr, tpr, label='AUC( area =%0.2f)' % logit_roc_auc)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Oranı')
        plt.ylabel('True Positive Oranı')
        plt.title('ROC')
        plt.show()

    def visualization_residual(self, model=None):
        """Plot ``model.resid`` and return the result of ``plt.plot``.

        NOTE(review): ``resid`` looks like the attribute of a statsmodels
        results object; scikit-learn estimators do not expose it — confirm
        which kind of model callers pass here.
        """
        estimator = self._resolve_model(model)
        return plt.plot(estimator.resid)
        
