In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC  

In [13]:
# Load the dataset CSV into a DataFrame. The bare `df` on the last line is the
# cell's final expression, so the notebook renders the frame as the cell output.
# NOTE(review): the path is an absolute location on one machine; confirm or
# adjust it before re-running this cell elsewhere.
df = pd.read_csv('/Users/manikhossain/Downloads/Py_R_Data/datasets.csv')
df 

Unnamed: 0,ID,NoD,TAP,LCP,DC,BC,CC,TZ,CD,RCD,...,TFP,TFN,OSE,BCE,PDE,SV,OS,SDS,RS,TFS
0,apache/mahut,44,15.95,2.2727,0.008447,2.550000e-04,22.00,10,5,11.36360,...,45.19,5,1,0,0,0,0,1,0,0
1,apache/cassandra,348,7.30,0.2873,0.002180,3.000000e-08,194.00,20,22,6.32100,...,49.70,9,1,1,1,0,1,1,0,0
2,apache/lucene-solr,181,12.83,0.5500,0.003280,2.900000e-07,105.00,19,25,13.81215,...,47.31,10,1,1,0,0,1,1,0,0
3,apache/pig,32,23.13,3.1250,0.204000,2.000000e-05,19.00,1,5,15.62500,...,46.31,2,1,1,0,0,0,1,0,0
4,apache/jackrabbit,49,22.00,429.0000,0.130000,6.500000e-06,29.00,1,9,18.39000,...,33.97,3,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,apacheranger,67,20.51,0.6000,0.009500,2.200000e-06,41.99,13,18,26.00000,...,44.45,5,0,1,0,0,0,1,0,1
70,kdek3b,148,15.90,0.2800,0.005348,2.900000e-07,77.00,11,4,2.70000,...,35.96,1,1,0,1,0,1,0,0,1
71,Apcheclimate,56,18.47,0.7000,20.800000,9.300000e-05,24.70,9,2,3.00000,...,41.81,1,1,0,0,0,0,0,0,1
72,Directmemory,10,22.13,6.4000,0.031200,4.900000e-03,7.37,1,7,70.00000,...,42.69,2,1,1,0,0,0,0,0,1


In [25]:
def mainModels(data_path='/Users/manikhossain/Downloads/Py_R_Data/datasets.csv',
               smells=None):
    """Train and report every classifier for each code-smell label column.

    Reads the dataset from ``data_path`` and, for each label in ``smells``,
    prints a banner and delegates to ``Models.predict`` which fits the
    classifiers and prints their accuracies.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to load. Defaults to the location originally hard-coded in
        this notebook, so calling ``mainModels()`` behaves as before.
    smells : iterable of str, optional
        Names of the label columns to predict, processed in the given order.
        Defaults to the eight columns originally hard-coded here.

    Raises
    ------
    KeyError
        If any requested label column is absent from the loaded data. This is
        raised before any model is trained, rather than part-way through.
    """
    if smells is None:
        smells = ['OSE', 'BCE', 'PDE', 'SV', 'OS', 'SDS', 'RS', 'TFS']
    else:
        # Materialise once so a generator can be both validated and iterated.
        smells = list(smells)

    df = pd.read_csv(data_path)

    # Fail fast with one clear message instead of crashing in the middle of
    # the loop after some labels have already been trained and printed.
    missing = [smell for smell in smells if smell not in df.columns]
    if missing:
        raise KeyError(
            'Label column(s) {} not found in {}'.format(missing, data_path))

    model = Models()
    for smell in smells:
        print('### Predicting code smell "{}" ###'.format(smell))
        model.predict(df, smell)

        
        
class Models():
    """Fit a fixed set of scikit-learn classifiers for one label and print accuracies.

    The class holds no state; ``predict`` builds the feature/label split and
    ``output_accuracy`` fits one estimator and prints its scores.
    """

    def predict(self, df, feature):
        """Evaluate every classifier on the label column ``feature``.

        Parameters
        ----------
        df : pandas.DataFrame
            Full dataset. Features are taken positionally from columns 1..23.
        feature : str
            Name of the column in ``df`` used as the prediction target.

        Returns
        -------
        None
            Results are printed, not returned.
        """
        # Features are selected by position.
        # NOTE(review): confirm that columns 1..23 of the CSV are exactly the
        # intended predictors (no identifier column, no label columns).
        X = df.iloc[:, 1:24]
        # Guard against label leakage: if the positional slice happens to
        # contain the column being predicted, remove it. This is a no-op when
        # the target is already outside the slice.
        X = X.drop(columns=[feature], errors='ignore')
        y = df[[feature]]

        # Fixed 67/33 hold-out split; random_state pins the shuffle.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.67, test_size=0.33, random_state=42)

        # (banner, estimator) pairs, evaluated in this order.
        # NOTE(review): the third banner says C4.5/J48 but the estimator is
        # scikit-learn's DecisionTreeClassifier; confirm the label is intended.
        classifiers = [
            ('-- Naive Bayes --',
             GaussianNB()),
            ('-- Random Forests --',
             RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)),
            ('-- C4.5 (implented as J48 in Weka) --',
             DecisionTreeClassifier(random_state=0)),
            ('-- Support Vector Machine using LIBSVM implementation with SMO --',
             SVC(gamma='auto')),
        ]
        for banner, clf in classifiers:
            print(banner)
            self.output_accuracy(X, y, X_train, y_train, X_test, y_test, clf)

    def output_accuracy(self, X, y, X_train, y_train, X_test, y_test, clf):
        """Fit ``clf`` and print training, testing and 5-fold CV accuracy.

        Parameters
        ----------
        X, y : pandas objects
            Complete feature matrix and labels, used for cross-validation.
            Note that these include the rows held out in ``X_test``/``y_test``.
        X_train, y_train : pandas objects
            Training split; ``clf`` is fitted on these.
        X_test, y_test : pandas objects
            Held-out split used for the testing accuracy.
        clf : estimator
            Object providing ``fit`` and ``score`` and accepted by
            ``cross_val_score``.

        Returns
        -------
        None
            The three scores are printed, followed by a blank separator.
        """
        # Flatten each label container to a 1-D array once; the estimators
        # are given 1-D targets.
        labels_all = y.values.ravel()
        labels_train = y_train.values.ravel()
        labels_test = y_test.values.ravel()

        clf.fit(X_train, labels_train)
        train_acc = clf.score(X_train, labels_train)
        test_acc = clf.score(X_test, labels_test)

        # Cross-validation runs over the full data set, independently of the
        # hold-out split above. Reported spread is two standard deviations.
        cv_scores = cross_val_score(clf, X, labels_all, cv=5)
        cv_acc = '{} (+/- {})'.format(cv_scores.mean(), cv_scores.std() * 2)

        print('Training accuray: {}'.format(train_acc))
        print('Testing accuray: {}'.format(test_acc))
        print('Cross-validation accuray: {}'.format(cv_acc))
        print('\n')
        
# Entry point: run the whole experiment only when this code is executed as the
# top-level program (i.e. when __name__ is '__main__').
if __name__ == '__main__':
    mainModels()
    

### Predicting code smell "OSE" ###
-- Naive Bayes --
Training accuray: 0.6938775510204082
Testing accuray: 0.52
Cross-validation accuray: 0.40476190476190477 (+/- 0.26255330956362327)


-- Random Forests --
Training accuray: 0.9795918367346939
Testing accuray: 0.88
Cross-validation accuray: 0.8238095238095238 (+/- 0.13866875979581944)


-- C4.5 (implented as J48 in Weka) --
Training accuray: 1.0
Testing accuray: 0.92
Cross-validation accuray: 0.9600000000000002 (+/- 0.10666666666666665)


-- Support Vector Machine using LIBSVM implementation with SMO --
Training accuray: 1.0
Testing accuray: 0.84
Cross-validation accuray: 0.7971428571428572 (+/- 0.011428571428571477)


### Predicting code smell "BCE" ###
-- Naive Bayes --
Training accuray: 0.6938775510204082
Testing accuray: 0.64
Cross-validation accuray: 0.7 (+/- 0.2887340418772533)


-- Random Forests --
Training accuray: 0.9387755102040817
Testing accuray: 0.88
Cross-validation accuray: 0.9035714285714287 (+/- 0.07609741340236618)
