# In [None]:

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import scipy.stats as stats

# Load the bag-of-words matrix and the document-frequency sheet; the sheet's
# header row supplies the column names for the bag-of-words matrix.
data = pd.read_csv('bow&dic/bow_train_over15.csv')
DF_file = pd.ExcelFile('bow&dic/df_Dic_org_over15_t.xlsx')
dfreq = DF_file.parse('Sheet1')
data.columns = dfreq.columns
# Drop the 'positive' column so that the first remaining column is the label.
data = data.drop(columns="['positive'")
# Optional tweaks kept from earlier experiments:
#   data = data.drop(columns=[ '0.1'])  # delete positive column
#   data = data[:100]
#
# Alternative data set (iris) used while prototyping:
#   import seaborn.apionly as sns
#   from sklearn import preprocessing
#   data = sns.load_dataset('iris')
#   le = preprocessing.LabelEncoder()
#   # convert the categorical columns into numeric
#   data['species'] = le.fit_transform(data['species'])

# Ordered 80/20 split: the first four fifths of the rows train, the rest test.
row_count = len(data)
split_point = int(row_count*4/5)
train_data = data.iloc[:split_point]
test_data = data.iloc[split_point:]
# Alternative random split for the iris prototype:
#   X_train, X_test, y_train, y_test = train_test_split(
#       data.loc[:,data.columns != 'species'], data['species'], test_size=0.2)
#   train_data = pd.concat([ X_train,y_train],axis=1)   # or np.c_[ X_train,y_train ]
#   test_data = pd.concat([ X_test,y_test],axis=1)      # or np.c_[ X_test,y_test ]

def gini(array):
    """Return the Gini coefficient of the values in *array*.

    The input is copied and flattened. If it contains negative values the
    whole array is shifted so that its minimum becomes zero, and a tiny
    constant is added to every value so that none is exactly zero.
    """
    values = np.array(array).flatten()
    lowest = np.amin(values)
    if lowest < 0:
        # Shift so that no value is negative.
        values -= lowest
    # Avoid exact zeros, then sort ascending as the rank formula requires.
    values = np.sort(values + 0.0000001)
    n = values.shape[0]
    ranks = np.arange(1, n + 1)
    weighted_total = np.sum((2 * ranks - n - 1) * values)
    return weighted_total / (n * np.sum(values))


def _abs_pearson_scores(features, label):
    """Return |Pearson r| between each feature column and the label."""
    scores = [
        abs(features.iloc[:, pos].corr(label, method='pearson'))
        for pos in range(features.shape[1])
    ]
    return pd.Series(scores, index=features.columns, dtype=float)


def _odds_ratio_scores(features, label):
    """Return the odds ratio of each feature's 2x2 table against the label."""
    label_values = label.to_numpy()
    in_class_one = label_values == 1
    in_class_zero = label_values == 0
    scores = []
    for pos in range(features.shape[1]):
        column = features.iloc[:, pos].to_numpy()
        present = column == 1
        absent = column == 0
        # Rows: label == 1 / label == 0.  Columns: feature == 1 / feature == 0.
        # Cells holding any other feature value are not counted.
        table = np.array([
            [np.count_nonzero(present & in_class_one),
             np.count_nonzero(absent & in_class_one)],
            [np.count_nonzero(present & in_class_zero),
             np.count_nonzero(absent & in_class_zero)],
        ])
        odds_ratio, _ = stats.fisher_exact(table)
        scores.append(odds_ratio)
    return pd.Series(scores, index=features.columns, dtype=float)


def _dfs_scores(features, label):
    """Return the DFS weight of each feature column."""
    label_values = label.to_numpy()
    in_class_one = label_values == 1
    in_class_zero = label_values == 0
    # The label sum is used as the size of class 1, so labels are presumably
    # 0/1 encoded -- NOTE(review): confirm against the data set.
    n_class_one = label.sum()
    n_class_zero = label.shape[0] - n_class_one
    scores = []
    for pos in range(features.shape[1]):
        column = features.iloc[:, pos]
        present = column.to_numpy() == 1
        # Rows of the training set per class in which the feature equals 1.
        hits = np.array([
            np.count_nonzero(present & in_class_one),
            np.count_nonzero(present & in_class_zero),
        ], dtype=float)
        # A column that sums to zero yields NaN here (numpy warning, no
        # exception); NaN scores sort last when the features are ranked.
        share = hits / column.sum()
        penalty = np.array([
            (n_class_one - hits[0]) / n_class_one + hits[1] / n_class_zero + 1,
            (n_class_zero - hits[1]) / n_class_zero + hits[0] / n_class_one + 1,
        ])
        scores.append((share / penalty).sum())
    return pd.Series(scores, index=features.columns, dtype=float)


def _gini_scores(features):
    """Return the Gini coefficient (module-level ``gini``) of each column."""
    scores = [gini(features.iloc[:, pos]) for pos in range(features.shape[1])]
    return pd.Series(scores, index=features.columns, dtype=float)


def _document_frequencies():
    """Return row 0 of the module-level ``dfreq`` without its first two entries.

    The first two entries are skipped by position; the original code treats
    them as the NEGATIVE and POSITIVE entries rather than features.
    """
    return dfreq.loc[0].iloc[2:]


def _score_features(technique, features, label):
    """Compute the per-feature score for one DataFrame-returning technique.

    Raises ValueError when *technique* is not a known name.
    """
    def correlation():
        return _abs_pearson_scores(features, label)

    def odds_ratio():
        return _odds_ratio_scores(features, label)

    def dfs():
        return _dfs_scores(features, label)

    def gini_index():
        return _gini_scores(features)

    def information_gain():
        return mutual_info_classif(features, label)

    def scaled_document_frequency():
        frequencies = _document_frequencies()
        return frequencies / frequencies.max()

    # Callables keep evaluation lazy: only the chosen recipe is computed.
    recipes = {
        "Correlation": correlation,
        "Correlation&DF": lambda: correlation() + scaled_document_frequency(),
        "Odds_Ratio&DF": lambda: odds_ratio() + scaled_document_frequency(),
        "Correlation&DFS": lambda: correlation() + dfs(),
        "Odds_Ratio&DFS": lambda: odds_ratio() + dfs(),
        "Correlation&Gini": lambda: correlation() + gini_index(),
        "Correlation&Information_Gain":
            lambda: correlation() * information_gain(),
        "Odds_Ratio&Information_Gain":
            lambda: odds_ratio() + information_gain(),
        "Gini_Index": gini_index,
        "Document_Frequency": _document_frequencies,
        "Odds_Ratio": odds_ratio,
        "DFS": dfs,
        "Gini_Index&Odds_Ratio": lambda: gini_index() * odds_ratio(),
    }
    try:
        recipe = recipes[technique]
    except KeyError:
        raise ValueError(
            "Unknown feature selection technique: %r" % (technique,)
        ) from None
    return recipe()


def feature_selection_technique(train, test, technique, nFeature):
    """Rank the feature columns of *train* and keep the best ones.

    The first column of *train* and *test* is the class label; every other
    column is a candidate feature. Scores are computed on *train* only and
    the same columns are then taken from *test*.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Label in column 0, features in the remaining columns.
    technique : str
        One of "Correlation", "Correlation&DF", "Odds_Ratio&DF",
        "Correlation&DFS", "Odds_Ratio&DFS", "Correlation&Gini",
        "Correlation&Information_Gain", "Odds_Ratio&Information_Gain",
        "Information_Gain", "Gini_Index", "Document_Frequency",
        "Odds_Ratio", "DFS", "Gini_Index&Odds_Ratio", "Gini_Indexes&PCA".
    nFeature : int
        Requested number of features (see the note below).

    Returns
    -------
    (new_train, new_test, relevant_features)
        *new_train* / *new_test* carry the label in their first column
        followed by the selected (or PCA-projected) features. They are
        numpy arrays for "Information_Gain" and "Gini_Indexes&PCA" and
        DataFrames for every other technique. *relevant_features* is the
        list of selected column names for "Information_Gain" and a Series
        of the retained scores (best first) otherwise.

    Raises
    ------
    ValueError
        If *technique* is not one of the names listed above.

    Notes
    -----
    The techniques ending in "DF" and "Document_Frequency" read the
    module-level ``dfreq`` table; the Gini based ones call the module-level
    ``gini`` function.

    NOTE(review): apart from "Information_Gain" and
    "Correlation&Information_Gain", which keep exactly ``nFeature`` columns,
    the ranking techniques keep ``nFeature + 1`` columns
    ("Gini_Indexes&PCA" feeds ``nFeature + 1`` columns into a PCA with
    ``nFeature`` components). That count is preserved from the original
    implementation; confirm whether the extra column is intended.
    """
    train_label = train.iloc[:, 0]
    test_label = test.iloc[:, 0]
    X = train.iloc[:, 1:]

    if technique == "Information_Gain":
        selector = SelectKBest(mutual_info_classif, k=nFeature)
        selected_train = selector.fit_transform(X, train_label)
        relevant_features = X.columns[selector.get_support(indices=True)].tolist()
        # First column is the label.
        new_train = np.c_[np.array(train_label), selected_train]
        new_test = np.c_[np.array(test_label), test[relevant_features]]

    elif technique == "Gini_Indexes&PCA":
        ranked = _gini_scores(X).sort_values(ascending=False)
        relevant_features = ranked.iloc[:nFeature + 1]
        pca = PCA(n_components=nFeature)
        pca.fit(train[relevant_features.index])
        # First column is the label.
        new_train = np.c_[train_label, pca.transform(train[relevant_features.index])]
        new_test = np.c_[test_label, pca.transform(test[relevant_features.index])]

    else:
        ranked = _score_features(technique, X, train_label)
        ranked = ranked.sort_values(ascending=False)
        if technique == "Correlation&Information_Gain":
            keep = nFeature
        else:
            keep = nFeature + 1
        # .iloc keeps the slice positional whatever the index labels are.
        relevant_features = ranked.iloc[:keep]
        # First column is the label.
        new_train = pd.concat(
            [train_label.to_frame(), train[relevant_features.index]], axis=1)
        new_test = pd.concat(
            [test_label.to_frame(), test[relevant_features.index]], axis=1)

    print(relevant_features)
    return new_train, new_test, relevant_features

# classify data
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Classifiers that are trained and scored on every reduced feature set.
classifiers = [
    GaussianNB(),
    DecisionTreeClassifier(),
    MLPClassifier(alpha=1e-3, hidden_layer_sizes=(10), max_iter=100),
    SVC(probability=True),
]
# Single-criterion techniques, kept for reference:
#techniques = [ 'Document_Frequency', 'Information_Gain', 'Gini_Index',
#              'DFS','Odds_Ratio','Correlation','Gini_Index&Odds_Ratio',
#              'Gini_Indexes&PCA','Correlation&Information_Gain']
techniques = [
    'Correlation&DF',
    'Correlation&DFS',
    'Odds_Ratio&DF',
    'Odds_Ratio&DFS',
    'Odds_Ratio&Information_Gain',
    'Correlation&Gini',
]

for Technique in techniques:
    # Number of features requested from the selector.
    number_of_feature = 100
    new_Train, new_Test, Relevant_Features = feature_selection_technique(
        train_data, test_data, Technique, number_of_feature)

    # Column 0 is the label, everything after it is the feature matrix.
    # These two techniques hand back numpy arrays, the others DataFrames.
    if Technique in ("Information_Gain", "Gini_Indexes&PCA"):
        X_train, y_train = new_Train[:, 1:], new_Train[:, 0]
        X_test, y_test = new_Test[:, 1:], new_Test[:, 0]
    else:
        X_train, y_train = new_Train.iloc[:, 1:], new_Train.iloc[:, 0]
        X_test, y_test = new_Test.iloc[:, 1:], new_Test.iloc[:, 0]

    # Fit each classifier on the reduced training set and report its scores.
    for clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        clf_name = type(clf).__name__

        print(Technique+'---------------------------------------')
        print(clf_name + " test Accuracy Rate is: %f" % accuracy_score(y_test, y_pred))

        print(clf_name + " F1_micro is: %f" % f1_score(y_test, y_pred, average='micro'))
        print(clf_name + " F1_macro is: %f" % f1_score(y_test, y_pred, average='macro'))
        # Unweighted mean of the per-class F1 scores.
        f1_sc = f1_score(y_test, y_pred, average=None)
        print('f1_score is: %f' % f1_sc.mean())
        print("RMSE= " ,np.sqrt(mean_squared_error(y_test,y_pred)) )

        # 10-fold cross-validation on the training portion only.
        scores = cross_val_score(clf, X_train, y_train, cv=10)
        print("validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
