In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

from warnings import filterwarnings
filterwarnings('ignore')



In [2]:
from termcolor import colored

In [3]:


class Information:
    """Print a quick textual overview of a tabular data set.

    Parameters
    ----------
    data :
        Object to summarise. The methods call ``head()``, ``describe()``,
        ``info()`` and ``shape`` on it, so a pandas ``DataFrame`` is expected.
    """

    def __init__(self, data):
        self.data = data

    def data_features(self):
        """Print head, descriptive statistics, info and shape of ``self.data``.

        Everything is written to standard output; the method returns ``None``.
        """
        print("--------------------------  DATA HEAD --------------------------\n")
        print(self.data.head())
        print()

        print("--------------------------  DATA DESCRIBE --------------------------\n")
        print(self.data.describe().T)
        print()

        print("--------------------------  DATA INFO --------------------------\n")
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() used to append a stray "None" line.
        self.data.info()
        print()

        print("--------------------------  DATA SHAPE --------------------------\n")
        print(self.data.shape)
        print()
    

In [4]:
class Visualizer:
    """Small collection of matplotlib / seaborn plotting helpers."""

    def __init__(self):
        # Kept for backward compatibility: constructing a Visualizer has
        # always emitted a single blank line.
        print()

    def barplot(self, column1, column2):
        """Show a bar chart of the cross-tabulation of two columns.

        Parameters
        ----------
        column1, column2 :
            Values passed straight to ``pd.crosstab`` as index and columns.
        """
        pd.crosstab(column1, column2).plot(kind='bar')
        plt.show()

    def missing_values(self, data):
        """Show a heatmap of the null mask of ``data``.

        ``data`` is also stored on the instance as ``self.data``.
        """
        self.data = data
        plt.subplots(figsize=(12, 6))
        plt.title('Missing Values')
        sns.heatmap(self.data.isnull(), yticklabels=False, cmap="viridis")
        plt.show()

    def confusion_matris(self, y, y_pred):
        """Plot the confusion matrix of ``y`` versus ``y_pred``.

        The matrix is computed with ``confusion_matrix(y, y_pred)`` and drawn
        with one annotated cell per (actual, predicted) pair. Axis labels use
        the row/column position in the matrix (``0``, ``1``, ...), which for a
        0/1 encoded binary target is the class value itself.

        Parameters
        ----------
        y :
            True labels.
        y_pred :
            Predicted labels.
        """
        cm = confusion_matrix(y, y_pred)
        # Size everything from the matrix instead of assuming exactly two
        # classes; for a 2x2 matrix this reproduces the previous layout.
        n_classes = cm.shape[0]
        positions = tuple(range(n_classes))

        fig, ax = plt.subplots(figsize=(5, 5))
        ax.imshow(cm)
        ax.grid(False)
        ax.xaxis.set(ticks=positions,
                     ticklabels=tuple('Predicted %ds' % k for k in positions))
        ax.yaxis.set(ticks=positions,
                     ticklabels=tuple('Actual %ds' % k for k in positions))
        # Inverted y-limits keep row 0 at the top and stop the outer cells
        # from being cut in half.
        ax.set_ylim(n_classes - 0.5, -0.5)
        for i in positions:
            for j in positions:
                ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
        plt.show()



In [5]:
# Preprocessing: check whether there are missing observations and, if so,
# decide which method should be used to fill (or drop) them.
class Preprocess:
    """Missing-value handling and class re-balancing helpers for a DataFrame.

    Parameters
    ----------
    data :
        The frame to work on. ``imputation`` and ``drop`` modify this object
        in place (column assignment / ``dropna(inplace=True)``).
    """

    def __init__(self, data):
        self.data = data

    def get_missing_values(self):
        """Return the number of nulls per column, sorted in descending order.

        A coloured header line is printed before the Series is returned.
        """
        missing_values = self.data.isnull().sum().sort_values(ascending=False)
        print(colored("------------  Missing Values  -------------", 'blue'))
        return missing_values

    # Choose how the empty cells of a column are filled: mean, median, mode,
    # or a specific value supplied by the caller.
    def imputation(self, column, method):
        """Fill the nulls of ``column`` and return the updated null counts.

        Parameters
        ----------
        column :
            Label of the column in ``self.data`` to fill.
        method :
            ``"median"``, ``"mode"`` or ``"mean"`` to fill with that statistic
            of the column; any other value is used as the fill value itself.

        Returns
        -------
        The result of ``get_missing_values()`` after filling.
        """
        series = self.data[column]
        # Only strings can be keywords; this also avoids an ambiguous
        # truth-value error when the fill value is array-like.
        keyword = method if isinstance(method, str) else None

        if keyword == "median":
            fill_value = series.median()
        elif keyword == "mode":
            # Series.mode() returns a Series (there may be ties). Passing that
            # Series to fillna() aligns on index labels and leaves most nulls
            # untouched, so take the first mode as a scalar instead.
            modes = series.mode()
            if modes.empty:
                # Column has no non-null values, so there is nothing to fill with.
                return self.get_missing_values()
            fill_value = modes.iloc[0]
        elif keyword == "mean":
            fill_value = series.mean()
        else:
            fill_value = method

        self.data[column] = series.fillna(fill_value)
        return self.get_missing_values()

    def drop(self, method):
        """Drop rows containing nulls and return the updated null counts.

        Parameters
        ----------
        method :
            ``"any"`` drops every row that has at least one null (the shape
            is printed before and after); ``"all"`` drops rows in which every
            value is null; any other value is treated as a column label and
            rows with a null in that column are dropped.
        """
        if method == "any":
            # Remove a row as soon as it contains any NaN.
            print("Drop Öncesi Data Shape -->   ", self.data.shape)
            self.data.dropna(how="any", inplace=True)
            print("Drop Sonrası Data Shape -->   ", self.data.shape)
        elif method == "all":
            # Remove a row only when the whole row is NaN.
            self.data.dropna(how="all", inplace=True)
        else:
            # `method` is a column label: remove rows whose value in that
            # column is NaN.
            self.data.dropna(subset=[method], inplace=True)

        return self.get_missing_values()

    def SMOTE(self, X, y, test_size=0.3, random_state=0):
        """Oversample the training split of ``X``/``y`` with SMOTE.

        The data is split with ``train_test_split`` and only the training part
        is resampled; the test part is discarded here, so a caller that needs
        it must repeat the split with the same ``test_size``/``random_state``.

        Parameters
        ----------
        X :
            Feature frame (its ``columns`` are reused for the result).
        y :
            Target, either a DataFrame or a one-dimensional Series.
        test_size, random_state :
            Forwarded to ``train_test_split``; ``random_state`` also seeds the
            sampler. The defaults reproduce the previous hard-coded values.

        Returns
        -------
        (os_data_X, os_data_y) : tuple of DataFrames with the resampled
        training features and targets.
        """
        # Inside a method the bare name SMOTE resolves to the module-level
        # imblearn class, not to this method.
        sampler = SMOTE(random_state=random_state)
        X_train, _, y_train, _ = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # `fit_sample` is the legacy spelling and is absent from newer
        # imbalanced-learn releases; prefer `fit_resample` when it exists.
        resample = getattr(sampler, "fit_resample", None)
        if resample is None:
            resample = sampler.fit_sample
        X_resampled, y_resampled = resample(X_train, y_train)

        os_data_X = pd.DataFrame(data=X_resampled, columns=X_train.columns)
        if isinstance(y_train, pd.DataFrame):
            os_data_y = pd.DataFrame(data=y_resampled, columns=y_train.columns)
        else:
            # A Series target has no `.columns`; fall back to its name.
            target_name = getattr(y_train, "name", None)
            if target_name is None:
                target_name = "target"
            os_data_y = pd.DataFrame({target_name: list(y_resampled)})

        return os_data_X, os_data_y
        
        
        
        


In [6]:
class GridSearchHelper:
    """Fit and report on classification models for a feature matrix / target."""

    def __init__(self):
        print("Grid Search Helper")

    def LogReg(self, X, y):
        """Fit logistic regression on ``X``/``y`` and print a full report.

        The report consists of, in order: the statsmodels ``Logit`` summary,
        the scikit-learn model's intercept and coefficients, an in-sample
        confusion matrix / accuracy / classification report, an in-sample ROC
        curve, and finally hold-out metrics from an 80/20 train-test split
        plus a 10-fold cross-validation score.

        ``X`` and ``y`` are also stored on the instance as ``self.X`` and
        ``self.y``. Nothing is returned.

        Parameters
        ----------
        X :
            Feature matrix. It is passed to ``sm.Logit`` unchanged, so no
            constant column is added for the statsmodels fit.
        y :
            Target; ``roc_curve`` is called on it, so a binary target is
            expected.
        """
        self.X = X
        self.y = y

        print(colored('------------------     STATS  MODELS  --------------------\n','blue'))
        ljm = sm.Logit(self.y, self.X).fit()
        print(ljm.summary())

        print()
        print(colored('------------------     SCIKIT LEARN MODEL  --------------------\n','blue'))

        # scikit-learn model fitted on the full data set
        loj = LogisticRegression(solver="liblinear")
        loj_model = loj.fit(self.X, self.y)

        print(colored("Intercept :  ",'red'), loj_model.intercept_)
        print()
        print(colored("Coefficient  : ",'red'), loj_model.coef_)
        print()

        print(colored('-------------- CONFUSION MATRIS  --------------------\n','blue'))
        # These metrics are in-sample: the model is scored on the data it
        # was trained on.
        y_pred = loj_model.predict(self.X)
        visualize = Visualizer()
        visualize.confusion_matris(self.y, y_pred)

        print(colored("Accuracy  Score :  ",'red'), accuracy_score(self.y, y_pred))
        print(colored("Classification Report  : ",'red'))
        print(classification_report(self.y, y_pred))

        print(colored('-------------- ROC CURVE  --------------------\n','blue'))
        # Use the positive-class probabilities for both the curve and the AUC.
        # Scoring the AUC on hard 0/1 predictions (as before) gives the area
        # of a single-threshold curve, which does not match the plotted line.
        positive_proba = loj_model.predict_proba(self.X)[:, 1]
        logit_roc_auc = roc_auc_score(self.y, positive_proba)
        fpr, tpr, thresholds = roc_curve(self.y, positive_proba)

        plt.figure()
        plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Oranı')
        plt.ylabel('True Positive Oranı')
        plt.title('ROC')
        plt.show()

        print(colored('-------------- TRAIN - TEST SPLIT  --------------------\n','blue'))

        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
        # Fit a separate estimator on the training split. Previously the new
        # estimator was created but the full-data one was refit instead,
        # silently overwriting the model reported above.
        loj_ = LogisticRegression(solver="liblinear")
        loj_model_ = loj_.fit(X_train, y_train)
        y_test_pred = loj_model_.predict(X_test)

        print(colored('Accuracy  Score  : ', 'red'), accuracy_score(y_test, y_test_pred))

        print(colored("Classification Report  : ",'red'))
        print(classification_report(y_test, y_test_pred))

        # cross_val_score clones and refits the estimator on each fold, so the
        # fitted state of the model passed in does not affect the score.
        # NOTE(review): the folds are taken from the test split only, as in
        # the original; confirm this is intended rather than cross-validating
        # on the training data.
        print(colored("Cross Validation Score  :  ",'red'),
              cross_val_score(loj_model_, X_test, y_test, cv=10).mean())
        
        
       
   