In [805]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from sklearn import preprocessing 

class DataDefinition():
    """Console-oriented helpers for a first look at a pandas DataFrame.

    Methods print their findings. Methods that transform the data
    (missing_value, Dummy) return the transformed frame and leave the frame
    passed in by the caller unmodified.
    """

    def __init__(self, dataName):
        """Store the data set name; the data itself is passed to each method."""
        print("dataDefinition init çağrıldı")
        self.dataName = dataName

    def Information(self, data):
        """Print head, transposed describe(), info(), column labels and the total null count."""
        print("---------------------Data Head---------------------")
        print(data.head())
        print("\n---------------------Data Describe---------------------")
        print(data.describe().T)
        print("\n---------------------Data Info---------------------")
        # DataFrame.info() writes its own report and returns None, so wrapping
        # it in print() only appended a stray "None" line.
        data.info()
        print("\n---------------------Data Columns---------------------")
        print(data.columns)
        print("\n---------------------Null Sayısı---------------------")
        print(data.isnull().sum().sum())

    def variable_type(self, data):
        """Print the object-dtype columns with their distinct-value counts.

        Returns:
            list: names of the object-dtype columns, in column order.
        """
        kat_df = data.select_dtypes(include=["object"])
        print("\n---------------------Categorical Variables---------------------")
        print(kat_df)
        print("\n---------------------Value Counts---------------------")
        kat_deg = []
        for column in kat_df:
            # nunique() ignores NaN, like the value_counts().count() it replaces.
            print(column, ": ", data[column].nunique())
            kat_deg.append(column)
        return kat_deg

    def categorical_variable_frequency(self, df, variables):
        """Print the value counts of each listed column and draw them as a horizontal bar chart."""
        print("\n---------------------Categorical variable frequency---------------------")
        for var in variables:
            counts = df[var].value_counts()
            print(counts)
            print("\n---------------------"+ var +"---------------------")
            # One figure per variable; without this every chart was drawn on
            # the same axes and only the Axes repr was printed.
            fig, ax = plt.subplots()
            counts.plot.barh(ax=ax)
            ax.set_title(var)
            plt.show()

    def missing_value(self, df):
        """Report missing values and return a copy of df without the rows that contain any.

        The frame passed in is not modified; use the return value.
        """
        print("\n---------------------Is there any missing value?---------------------")
        print(df.isnull().values.any())
        print("\n---------------------Total number of null observations---------------------")
        print(df.isnull().sum().sum())
        print("\n---------------------Which variables have null values?---------------------")
        print(df.isnull().sum())
        print("\n---------------------Missing values deleted---------------------")
        # Previously the result was bound to a local name and lost.
        return df.dropna(axis=0)

    def Dummy(self, data, variable):
        """Replace column `variable` with its one-hot (dummy) columns.

        The frame passed in is not modified; use the return value.

        Returns:
            pandas.DataFrame: data without `variable`, followed by the dummy columns.
        """
        dms = pd.get_dummies(data[variable])
        # drop(..., inplace=True) returns None, which made the old code lose
        # every original column; build a new frame instead.
        remaining = data.drop(columns=[variable])
        result = pd.concat([remaining, dms], axis=1)
        print("\n---------------------Dummy applied to "+ variable+ " variable---------------------")
        print(result.head())
        return result

    def NormalDistribution(self, data, alpha=0.05):
        """Run the Shapiro-Wilk test on every numeric column of data and print the verdict.

        Args:
            data: DataFrame whose numeric columns are tested.
            alpha: significance level; a p-value above it means normality is not rejected.
        """
        print("\n---------------------Normal distribution control---------------------")
        for position, dtype in enumerate(data.dtypes):
            if not pd.api.types.is_numeric_dtype(dtype):
                continue
            # Test the frame that was passed in (the old code read a global `df`).
            stat, p = shapiro(data.iloc[:, position])
            if p > alpha:
                print("Orneklem Normal (Gaussian) Dagilimdan gelmektedir (Fail to Reject H0)")
            else:
                print("Orneklem Normal (Gaussian) Dagilimdan gelmemektedir")
                    
                    

class DataVisualization():
    """Plotting helpers for the numeric part of a data set."""

    def __init__(self, dataName):
        """Store the data set name; the data itself is passed to each method."""
        print("DataVisualization init çağrıldı")
        self.dataName = dataName

    def histogram(self, data, kat_deg):
        """Draw one histogram per column of data after dropping the columns listed in kat_deg."""
        numeric = data.drop(kat_deg, axis=1)
        numeric.hist(figsize=(24,16), bins=40, xlabelsize=6, ylabelsize=6)
        plt.show()

    def heatmap(self, data, kat_deg=None):
        """Draw a seaborn heatmap of the values in data.

        Args:
            data: DataFrame to plot.
            kat_deg: columns to leave out. When omitted, every non-numeric
                column is left out (the old code read an undefined global
                `kat_deg` here).
        """
        if kat_deg is None:
            numeric = data.select_dtypes(include="number")
        else:
            numeric = data.drop(kat_deg, axis=1)
        sns.heatmap(numeric)
        plt.show()
    

class Preprocessing():
    """Outlier removal and scaling helpers.

    Every method returns its result and leaves the object passed in by the
    caller unmodified.
    """

    def __init__(self, dataName):
        """Store the data set name; the data itself is passed to each method."""
        print("OutlierObservationAnalysis init çağrıldı")
        self.dataName = dataName

    @staticmethod
    def _like_input(values, data):
        """Wrap a transformed array in a DataFrame with data's labels when data is a DataFrame."""
        if isinstance(data, pd.DataFrame):
            return pd.DataFrame(values, columns=data.columns, index=data.index)
        return values

    def OutlierObservationDelete(self, df_table):
        """Remove observations outside the 1.5 * IQR fences.

        Args:
            df_table: a Series, or a DataFrame whose numeric columns are checked.

        Returns:
            A copy of df_table without the rows that hold an outlier (for a
            DataFrame: an outlier in at least one numeric column).
        """
        if isinstance(df_table, pd.Series):
            numeric = df_table
        else:
            numeric = df_table.select_dtypes(include="number")
        Q1 = numeric.quantile(0.25)
        Q3 = numeric.quantile(0.75)
        IQR = Q3 - Q1
        alt_sinir = Q1 - 1.5*IQR
        ust_sinir = Q3 + 1.5*IQR
        aykiri_tf = (numeric < alt_sinir) | (numeric > ust_sinir)
        if isinstance(aykiri_tf, pd.DataFrame):
            # Indexing a frame with a boolean frame keeps the full index, so
            # the old code dropped every row; reduce to one flag per row.
            aykiri_tf = aykiri_tf.any(axis=1)
        # drop(..., inplace=True) returns None; select the kept rows instead.
        return df_table.loc[~aykiri_tf].copy()

    def Standardizasyon(self, data):
        """Return data scaled with sklearn.preprocessing.scale (previously the result was discarded)."""
        print("\n---------------------Standardization---------------------")
        from sklearn import preprocessing
        return self._like_input(preprocessing.scale(data), data)

    def Normalizasyon(self, data):
        """Return data transformed with sklearn.preprocessing.normalize (previously the result was discarded)."""
        print("\n---------------------Normalization---------------------")
        from sklearn import preprocessing
        return self._like_input(preprocessing.normalize(data), data)

    def min_max(self, data):
        """Return data transformed with MinMaxScaler, keeping the column and row labels of data."""
        print("\n---------------------Min-Max Transform---------------------")
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        return self._like_input(scaler.fit_transform(data), data)
        

class Correlation():
    """Correlation-based column filtering and target-variable plots."""

    def __init__(self, dataName):
        """Store the data set name; the data itself is passed to each method."""
        print("Models init çağrıldı")
        self.dataName = dataName

    def correlation(self, data, threshold=0.7):
        """Print the correlation matrix and drop columns highly correlated with an earlier one.

        A column is removed when its correlation with any column to its left
        is at least `threshold`. The comparison is signed, as before: strong
        negative correlations are not removed.

        Args:
            data: input DataFrame; only numeric and boolean columns are considered.
            threshold: correlation at or above which the later column is removed.

        Returns:
            pandas.DataFrame: the retained numeric columns (non-numeric
            columns are not part of the result).
        """
        # Restrict to numeric data first: DataFrame.corr() raises on object
        # columns in recent pandas, and the keep-mask below must line up with
        # exactly the columns that went into the matrix.
        numeric = data.select_dtypes(include=["number", "bool"])
        corr = numeric.corr()
        print("\n---------------------Correlation---------------------")
        print(corr)

        # Entry (i, j) with i < j marks "column j is highly correlated with
        # the earlier column i"; NaN correlations compare as False.
        above = np.triu(corr.to_numpy() >= threshold, k=1)
        keep = ~above.any(axis=0)

        data = numeric.loc[:, keep]
        print("\n---------------------Highly correlated variables deleted---------------------")
        print("\n---------------------New Data Shape---------------------")
        print(data.shape)
        return data

    def target_plot(self, data, target):
        """Plot the distribution of data[target] and its normal probability (Q-Q) plot."""
        import scipy.stats as stats
        print("\n---------------------Plot of target variable---------------------")
        if hasattr(sns, "histplot"):
            # distplot is deprecated in seaborn; histplot with a KDE on a
            # density scale is its documented replacement.
            sns.histplot(data[target], kde=True, stat="density")
        else:
            sns.distplot(data[target])
        plt.show()
        print("\n---------------------Normality of the target variable---------------------")
        stats.probplot(data[target], dist="norm", plot=plt)
        plt.show()
    
    
class Processes():
    """Train/test splitting plus a collection of model-fitting recipes.

    Each model method receives the full DataFrame and the name of the target
    column, splits the data with Split(), fits the model(s) and prints the
    scores. Third-party libraries are imported inside the method that needs
    them, so a method only requires the libraries it actually uses.
    """

    def __init__(self, dataName):
        """Store the data set name; the data itself is passed to each method."""
        print("Processes init çağrıldı")
        self.dataName = dataName

    @staticmethod
    def _target_name(data, model, target=None):
        """Return the target column name: `target` when given, otherwise `model`."""
        name = model if target is None else target
        if name not in data.columns:
            raise KeyError("target column not found in data: " + repr(name))
        return name

    def Split(self, df, target):
        """Split df into train and test parts (last 20% of the rows form the test set).

        Returns:
            tuple: x_train, x_test, y_train, y_test
        """
        from sklearn.model_selection import train_test_split
        x_df = df.drop([target], axis=1)
        y_df = df[target]

        # shuffle=False keeps the row order, so random_state has no effect here.
        x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0, shuffle=False)
        print("x_train: " + str(x_train.shape) + " x_test: " + str(x_test.shape) +" y_train: " + str(y_train.shape) +" y_test: " + str(y_test.shape) )
        return x_train, x_test, y_train, y_test

    def linear_regression(self, df, target):
        """Fit a multiple linear regression and print the test-set RMSE."""
        from sklearn.linear_model import LinearRegression
        from sklearn.metrics import mean_squared_error
        print("\n---------------------MULTİPLE LİNEAR REGRESSİON---------------------")
        x_train, x_test, y_train, y_test = self.Split(df, target)
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_test)
        # y_test is the local returned by Split(); there is no self.y_test.
        print(np.sqrt(mean_squared_error(y_test, y_pred)))

    def PCA_model(self, data, model, target=None):
        """Run PCA on the scaled training features and fit a linear regression on the components.

        Args:
            data: DataFrame containing features and target.
            model: name of the target column. The parameter keeps its
                historical name; the old body ignored it and read an
                undefined global `target`.
            target: optional explicit target column name; overrides `model`.
        """
        print("\n---------------------PCA Model---------------------")
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import scale
        from sklearn.linear_model import LinearRegression
        from sklearn.metrics import r2_score

        target = self._target_name(data, model, target)
        pca = PCA()
        x_train, x_test, y_train, y_test = self.Split(data, target)
        X_reduced_train = pca.fit_transform(scale(x_train))
        print("X_reduced_train.shape: " + str(X_reduced_train.shape))

        # Share of variance explained by each component.
        features = range(pca.n_components_)
        plt.bar(features, pca.explained_variance_ratio_, color='black')
        plt.xlabel('PCA features')
        plt.ylabel('variance %')
        plt.xticks(features)
        plt.show()

        # Cumulative explained variance (percent) of the first 15 components.
        print(np.cumsum(np.round(pca.explained_variance_ratio_, decimals = 4)*100)[0:15])

        lm = LinearRegression()
        pca_model = lm.fit(X_reduced_train, y_train)
        y_pred = pca_model.predict(X_reduced_train)
        print("r2: " + str(r2_score(y_train, y_pred)))

    def lojistik_regresyon(self, data, model, target=None):
        """Fit logistic regression (statsmodels summary + scikit-learn model) and print its scores.

        Args:
            data: DataFrame containing features and target.
            model: name of the target column. The parameter keeps its
                historical name; the old body ignored it and read an
                undefined global `target`.
            target: optional explicit target column name; overrides `model`.
        """
        print("\n---------------------Logistic Regression---------------------")
        import statsmodels.api as sm
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import (accuracy_score, classification_report,
                                     confusion_matrix, roc_auc_score, roc_curve)
        from sklearn.model_selection import cross_val_score, train_test_split

        target = self._target_name(data, model, target)
        y = data[target]
        X = data.drop([target], axis=1)

        loj = sm.Logit(y, X)
        loj_model= loj.fit()
        print(loj_model.summary())

        loj = LogisticRegression(solver = "liblinear")
        loj_model = loj.fit(X,y)
        print(loj_model)
        y_pred = loj_model.predict(X)
        print(confusion_matrix(y, y_pred))
        print("accuracy: " + str(accuracy_score(y, y_pred)))
        print(classification_report(y, y_pred))

        # The threshold can be changed. predict_proba returns one column per
        # class, so take the second column before comparing with it.
        # NOTE(review): assumes a binary target encoded as 0/1 - confirm.
        y_probs = loj_model.predict_proba(X)[:, 1]
        y_pred = [1 if p > 0.5 else 0 for p in y_probs]
        print(confusion_matrix(y, y_pred))
        print(accuracy_score(y, y_pred))
        print(classification_report(y, y_pred))

        # ROC curve; the AUC in the legend is computed from the same
        # probabilities as the curve (it used hard class predictions before).
        logit_roc_auc = roc_auc_score(y, y_probs)

        fpr, tpr, thresholds = roc_curve(y, y_probs)
        plt.figure()
        plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Oranı')
        plt.ylabel('True Positive Oranı')
        plt.title('ROC')
        plt.legend()
        plt.show()

        X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.30, random_state = 42)
        loj = LogisticRegression(solver = "liblinear")
        loj_model = loj.fit(X_train,y_train)
        print(accuracy_score(y_test, loj_model.predict(X_test)))
        print(cross_val_score(loj_model, X_test, y_test, cv = 10).mean())

    def Gaussian_Naive_Bayes(self, data, target):
        """Fit Gaussian naive Bayes; print test accuracy and the mean 10-fold CV score on the test part."""
        print("\n---------------------Gaussian Naive Bayes---------------------")
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import cross_val_score
        from sklearn.naive_bayes import GaussianNB
        X_train, X_test, y_train, y_test = self.Split(data, target)
        nb = GaussianNB()
        nb_model = nb.fit(X_train, y_train)
        y_pred = nb_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))
        print(cross_val_score(nb_model, X_test, y_test, cv = 10).mean())

    def KNN(self, data, target):
        """Fit k-nearest neighbours, tune n_neighbors by grid search and print the scores."""
        print("\n---------------------KNN---------------------")
        from sklearn.metrics import accuracy_score, classification_report
        from sklearn.model_selection import GridSearchCV
        from sklearn.neighbors import KNeighborsClassifier
        X_train, X_test, y_train, y_test = self.Split(data, target)
        knn = KNeighborsClassifier()
        knn_model = knn.fit(X_train, y_train)
        y_pred = knn_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        knn_params = {"n_neighbors": np.arange(1,50)}
        knn = KNeighborsClassifier()
        knn_cv = GridSearchCV(knn, knn_params, cv=10)
        knn_cv.fit(X_train, y_train)

        print("En iyi skor:" + str(knn_cv.best_score_))
        print("En iyi parametreler: " + str(knn_cv.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        knn = KNeighborsClassifier(**knn_cv.best_params_)
        knn_tuned = knn.fit(X_train, y_train)
        print(knn_tuned.score(X_test, y_test))
        y_pred = knn_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

    def SVC(self, data, target):
        """Fit a linear support vector classifier, tune C by grid search and print the accuracies."""
        print("\n---------------------SVC---------------------")
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        # Aliased so the estimator is not confused with this method's own name.
        from sklearn.svm import SVC as SVMClassifier
        X_train, X_test, y_train, y_test = self.Split(data, target)
        svm_model = SVMClassifier(kernel = "linear").fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        svc_params = {"C": np.arange(1,10)}
        svc = SVMClassifier(kernel= "linear")
        svc_cv_model = GridSearchCV(svc,svc_params, cv = 10, n_jobs = -1, verbose = 2 )
        svc_cv_model.fit(X_train, y_train)
        print("En iyi parametreler: " + str(svc_cv_model.best_params_))

        # Keep the kernel the search was run with and unpack the tuned values.
        svc_tuned = SVMClassifier(kernel = "linear", **svc_cv_model.best_params_).fit(X_train, y_train)
        y_pred = svc_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

    def CART(self, data, target):
        """Fit a decision tree, tune depth and split size by grid search and print the accuracies."""
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        from sklearn.tree import DecisionTreeClassifier
        print("\n---------------------CART---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        cart = DecisionTreeClassifier()
        cart_model = cart.fit(X_train, y_train)
        y_pred = cart_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        cart_grid = {"max_depth": range(1,10),
            "min_samples_split" : list(range(2,50))}

        cart = DecisionTreeClassifier()
        cart_cv = GridSearchCV(cart, cart_grid, cv = 10, n_jobs = -1, verbose = 2)
        cart_cv_model = cart_cv.fit(X_train, y_train)
        print("En iyi parametreler: " + str(cart_cv_model.best_params_))
        # Use the parameters the search found instead of hard-coded values.
        cart = DecisionTreeClassifier(**cart_cv_model.best_params_)
        cart_tuned = cart.fit(X_train, y_train)
        y_pred = cart_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

    def Random_Forests(self, data, target):
        """Fit a random forest, tune it by grid search, print the accuracies and plot feature importances."""
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        print("\n---------------------Random Forests---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        rf_model = RandomForestClassifier().fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        rf_params = {"max_depth": [2,5,8,10],
            "max_features": [2,5,8],
            "n_estimators": [10,500,1000],
            "min_samples_split": [2,5,10]}
        rf_model = RandomForestClassifier()

        rf_cv_model = GridSearchCV(rf_model,
                           rf_params,
                           cv = 10,
                           n_jobs = -1,
                           verbose = 2)
        rf_cv_model.fit(X_train, y_train)
        print("En iyi parametreler: " + str(rf_cv_model.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_)
        rf_tuned.fit(X_train, y_train)
        y_pred = rf_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        Importance = pd.DataFrame({"Importance": rf_tuned.feature_importances_*100},
                         index = X_train.columns)

        Importance.sort_values(by = "Importance", axis = 0, ascending = True).plot(kind ="barh", color = "r")
        plt.xlabel("Değişken Önem Düzeyleri")
        plt.show()

    def neural_network(self, data, target):
        """Fit an MLP classifier on standardized features, tune it by grid search and print the accuracies."""
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        from sklearn.neural_network import MLPClassifier
        from sklearn.preprocessing import StandardScaler
        print("\n---------------------Neural_Network---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        # The scaler is fitted on the training part only and applied to both parts.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        mlpc = MLPClassifier().fit(X_train_scaled, y_train)
        y_pred = mlpc.predict(X_test_scaled)
        print("accuracy :" + str(accuracy_score(y_test, y_pred)))
        mlpc_params = {"alpha": [0.1, 0.01, 0.02, 0.005, 0.0001,0.00001],
              "hidden_layer_sizes": [(10,10,10),
                                     (100,100,100),
                                     (100,100),
                                     (3,5),
                                     (5, 3)],
              "solver" : ["lbfgs","adam","sgd"],
              "activation": ["relu","logistic"]}

        mlpc = MLPClassifier()
        mlpc_cv_model = GridSearchCV(mlpc, mlpc_params,
                             cv = 10,
                             n_jobs = -1,
                             verbose = 2)

        mlpc_cv_model.fit(X_train_scaled, y_train)
        print("En iyi parametreler: " + str(mlpc_cv_model.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        mlpc_tuned = MLPClassifier(**mlpc_cv_model.best_params_)
        mlpc_tuned.fit(X_train_scaled, y_train)
        y_pred = mlpc_tuned.predict(X_test_scaled)
        print("new accuracy: " + str(accuracy_score(y_test, y_pred)))

    def Gradient_Boosting_Machines(self, data, target):
        """Fit gradient boosting, tune it by grid search and print the accuracies."""
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        print("\n---------------------Gradient Boosting Machines---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        gbm_model = GradientBoostingClassifier().fit(X_train, y_train)
        y_pred = gbm_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))
        # The grid listed 100 twice, which fitted the same candidates twice.
        # NOTE(review): the third value was possibly meant to be 1000 - confirm.
        gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.05],
             "n_estimators": [100,500],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}
        gbm = GradientBoostingClassifier()
        gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)
        gbm_cv.fit(X_train, y_train)
        print("En iyi parametreler: " + str(gbm_cv.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        gbm = GradientBoostingClassifier(**gbm_cv.best_params_)
        gbm_tuned =  gbm.fit(X_train,y_train)
        y_pred = gbm_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

    def XGBoost(self, data, target):
        """Fit an XGBoost classifier, tune it by grid search and print the accuracies."""
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        from xgboost import XGBClassifier
        print("\n---------------------XGBoost---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        xgb_model = XGBClassifier().fit(X_train, y_train)
        y_pred = xgb_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        # "min_samples_split" was removed from the grid: it is a scikit-learn
        # tree parameter, not an XGBoost one, and only multiplied the number
        # of fits by three.
        xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05]}

        xgb = XGBClassifier()

        xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)
        xgb_cv_model.fit(X_train, y_train)
        print("En iyi parametreler: " + str(xgb_cv_model.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        xgb = XGBClassifier(**xgb_cv_model.best_params_)
        xgb_tuned =  xgb.fit(X_train,y_train)
        y_pred = xgb_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

    def CatBoost(self, data, target):
        """Fit a CatBoost classifier, tune it by grid search and print the accuracies."""
        from catboost import CatBoostClassifier
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import GridSearchCV
        print("\n---------------------CatBoost---------------------")
        X_train, X_test, y_train, y_test = self.Split(data, target)
        cat_model = CatBoostClassifier().fit(X_train, y_train)
        y_pred = cat_model.predict(X_test)
        print(accuracy_score(y_test, y_pred))

        catb_params = {
            'iterations': [200,500],
            'learning_rate': [0.01,0.05, 0.1],
            'depth': [3,5,8] }
        catb = CatBoostClassifier()
        catb_cv_model = GridSearchCV(catb, catb_params, cv=5, n_jobs = -1, verbose = 2)
        catb_cv_model.fit(X_train, y_train)
        print("En iyi parametreler: " + str(catb_cv_model.best_params_))
        # best_params_ is a dict of keyword arguments and must be unpacked.
        catb = CatBoostClassifier(**catb_cv_model.best_params_)
        catb_tuned = catb.fit(X_train, y_train)
        y_pred = catb_tuned.predict(X_test)
        print(accuracy_score(y_test, y_pred))

In [779]:
# Workbook that holds the data set; the same name is handed to the helper class.
DataName = "HW_Data_Set.xlsx"

# `data` keeps the frame exactly as read; `df` is the working copy.
data = pd.read_excel(DataName)
df = data.copy()

data_definition = DataDefinition(DataName)

dataDefinition init çağrıldı
