In [1]:
# Ignore sklearn future warning.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from SentiCR.SentiCR import SentiCR
from SentiSW.code.classification.classifier import Classifier
from SentiSW.code.entity.training_set_generation import get_entity
import scikitplot as skplt
import matplotlib.pyplot as plt
import pandas as pd
import pickle

# Config NLTK java file finder.
import os
java_path = "C:/Program Files (x86)/Java/jre1.8.0_241/bin/java.exe"
os.environ['JAVAHOME'] = java_path


dir_path = os.path.abspath(os.getcwd())

# Classifier's model path
SentiCRModelPath = dir_path + "/SentiCR/SentiCR/models/SentiCR_model.pkl"
SentiSWModelPath = dir_path + "/SentiSW/data/model/sentimentClassification/classifier.pkl"

# Classifier's trainning files path
SentiCR_trainning_file_path = dir_path + '/SentiCR/oracle.xlsx'
SentiSW_trainning_file_path = dir_path + '/SentiSW/data/training_set_3000.csv'

def SentiSW_classify(text):
    if(os.path.exists(SentiSWModelPath)):
        sentiment_analyzer = Classifier(read=True, vector_method='tfidf')
        #sentiment_analyzer.save_model()
    else:
        sentiment_analyzer = Classifier(read=False, vector_method='tfidf')
    sentiment = sentiment_analyzer.get_sentiment_polarity_proba(text)[0]
    ret = {'sentiment': sentiment}
    if sentiment != 'Neutral':
    	entity = get_entity(text)
    	ret['entity'] = entity
    else:
    	ret['entity'] = None

    return ret

def SentiCR_classify(sentence):
    if(os.path.exists(SentiCRModelPath)):
        sentiment_analyzer = pickle.load(open(SentiCRModelPath, 'rb'))
    else:
        sentiment_analyzer = SentiCR.SentiCR()
        with open(SentiCRModelPath, 'wb') as model:
            pickle.dump(sentiment_analyzer, model)

    score = sentiment_analyzer.get_sentiment_polarity_probas(sentence)
    
    return score

# SentiCR dataset

In [2]:
sentiCR_df = pd.read_excel(SentiCR_trainning_file_path, names=['text', 'Annotation'])
sentiCR_df

Unnamed: 0,text,Annotation
0,- Should be like below:\ntextDirection = SWT.A...,0
1,"""""""create a vdsm.config.config clone, modified...",0
2,"""Add test(s) performing the static code analys...",0
3,"""apt-get"" is distro specific... perhaps make i...",0
4,"""easy"" is marketing; let the code speak for it...",0
...,...,...
1594,you'll need someone with some maven experience...,0
1595,Your memory is too smalll. Consider buying a R...,0
1596,You're preforming this check multiple times.\n...,0
1597,"You're right. Ivan,tenant_id is non-admin tena...",0


# SentiSW dataset

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 3000 entries, 0 to 2999

Data columns (total 3 columns):

    Column      Non-Null Count  Dtype 
    
 - 0   issue_id    3000 non-null   int64 
 - 1   Annotation  3000 non-null   object
 - 2   text        2993 non-null   object
 

First, we load the dataframe from the .csv file and remove rows containing NaN values in the 'text' column. This results in a dataframe containing 2993 rows, down from 3000.

In [3]:
sentiSW_df = pd.read_csv(SentiSW_trainning_file_path)
sentiSW_df.dropna(inplace=True)
sentiSW_df

Unnamed: 0,issue_id,Annotation,text
0,3001,Negative,I have a component with these options:\r\n\r\n...
1,3002,Neutral,### Version\r\n2.2.5\r\n\r\n### Reproduction l...
2,3003,Positive,### What problem does this feature solve?\r\nL...
3,3004,Neutral,I am trying to get some multi page setup simil...
4,3005,Neutral,"Hi,\r\n\r\nI am using [Vis Network](http://vis..."
...,...,...,...
2995,5996,Neutral,Yes. I definitely think we shoudl have an API...
2996,5997,Neutral,I think that this would complicate the languag...
2997,5998,Neutral,Another related issue is that we're not proper...
2998,5999,Positive,Here are the problems I am hearing about on th...


# SentiCR predictions on SentiSW's dataset.

In [None]:
sentiSW_train = sentiSW_df['text'].tolist()

sentiCR_predictions = []
for text in sentiSW_train:
    sentiCR_predictions.append(SentiCR_classify(text))

In [None]:
y_true = sentiSW_df['Annotation']

# Transforms SentiSW's labels ['Positive', 'Neutral', 'Negative'] to match SentiCR's binary classification ['Non-negative', 'Negative']
y_true = [x if x == 'Negative' else 'Non-negative' for x in y_true]

y_probas = [[sentiCR_predictions[i][0][0], sentiCR_predictions[i][0][1]] for i in range(len(sentiCR_predictions))]

skplt.metrics.plot_roc(y_true, y_probas)

fig = plt.gcf()

fig.set_size_inches(8, 8)
plt.savefig('a.png', dpi=100)

plt.show()

# SentiSW predictions on SentiCR's dataset

In [None]:
sentiCR_train = sentiCR_df['text'].tolist()

count = 0
sentiSW_predictions = []
for text in sentiCR_train:
    print(count)
    count += 1
    sentiSW_predictions.append(SentiSW_classify(text))

In [None]:
len(sentiSW_predictions)

with open('sentiSW_predictions.pkl', 'wb') as f:
    pickle.dump(sentiSW_predictions, f)

In [None]:
sentiSW_predictions = []
with open('sentiSW_predictions.pkl', 'rb') as f:
    sentiSW_predictions = pickle.load(f)

In [7]:
def convert_to_binary_prediction(preds):
    positive = max([preds[0], preds[1]])
    negative = preds[2]
    diff = abs(positive + negative - 1)
    
    positive += (diff/2)
    negative += (diff/2)
    
    return [positive, negative]

In [8]:
converted_predictions = [convert_to_binary_prediction(sentiSW_predictions[i]['sentiment']) for i in range(len(sentiSW_predictions))]
converted_predictions

[[0.839088714376771, 0.16091128562322898],
 [0.8351326964344495, 0.16486730356555052],
 [0.8351326964344495, 0.16486730356555052],
 [0.7691675120896524, 0.2308324879103477],
 [0.7721317531418607, 0.22786824685813933],
 [0.9209219793169179, 0.07907802068308203],
 [0.8952439999981994, 0.10475600000180059],
 [0.8351326964344495, 0.16486730356555052],
 [0.8351326964344495, 0.16486730356555052],
 [0.8508548488965764, 0.1491451511034236],
 [0.7520498473426174, 0.24795015265738266],
 [0.8138376912040478, 0.18616230879595225],
 [0.8495485505846112, 0.15045144941538885],
 [0.7831212411214041, 0.2168787588785958],
 [0.879457232306146, 0.12054276769385398],
 [0.7788973784261891, 0.2211026215738109],
 [0.7686149188254103, 0.23138508117458964],
 [0.8351326964344495, 0.16486730356555052],
 [0.8351326964344495, 0.16486730356555052],
 [0.8351326964344495, 0.16486730356555052],
 [0.8340733479965148, 0.16592665200348516],
 [0.7642727894161443, 0.23572721058385576],
 [0.839088714376771, 0.160911285623228

In [9]:
pos = 0
neu = 0
neg = 0
for i in range(len(sentiSW_predictions)):
    pred = sentiSW_predictions[i]['sentiment']
    if(pred[0] == max(pred)):
        pos += 1
    elif(pred[1] == max(pred)):
        neu += 1
    else:
        neg += 1
print("pos     ", pos)
print("neu     ", neu)
print("neg     ", neg)

pos      115
neu      1387
neg      97


In [16]:
for i in zip(sentiCR_df['Annotation'], sentiSW_predictions)[:10]:
    print(i)


TypeError: 'zip' object is not subscriptable

In [39]:
indexes = []
for i in sentiSW_predictions:
    indexes.append(max(range(len(i['sentiment'])), key=i['sentiment'].__getitem__))

a = list(zip(sentiCR_df['Annotation'], indexes))
a

[(0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (-1, 1),
 (0, 1),
 (0, 2),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (-1, 0),
 (0, 1),
 (0, 1),
 (-1, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (-1, 1),
 (0, 2),
 (0, 1),
 (0, 2),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (-1, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, 1),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 0),

In [10]:
zero = 0
one = 0
for pred in converted_predictions:
    if(pred[0] > pred[1]):
        zero += 1
    else:
        one += 1
print("0     ", zero)
print("-1     ", one)

0      1502
-1      97


In [11]:
sentiCR_df['Annotation'].value_counts()

 0    1201
-1     398
Name: Annotation, dtype: int64

In [None]:
sentiCR_y_true = sentiCR_df['Annotation']

sentiCR_y_true = ['Negative' if x != 0 else 'Non-negative' for x in sentiCR_y_true]

converted_predictions = [convert_to_binary_prediction(sentiSW_predictions[i]['sentiment']) for i in range(len(sentiSW_predictions))]

skplt.metrics.plot_roc(sentiCR_y_true, converted_predictions)

fig = plt.gcf()

fig.set_size_inches(8, 8)
plt.savefig('sentiSW_Preds_on_SentiCR.png', dpi=100)

plt.show()

In [None]:
fig = plt.figure(figsize=(3,4))
fig.savefig('fig1.png', dpi = 300)