In [None]:
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#Importing Datasets

In [None]:
# Read the questions, comments and answers CSV tables from Google Drive,
# in that order, and bind each to its own module-level name.
questions, comments, answers = (
    pd.read_csv(csvPath)
    for csvPath in (
        "/content/drive/MyDrive/Colab Files/TM-Project/Phase3/PPnewQuestionsPH3.csv",
        "/content/drive/MyDrive/Colab Files/TM-Project/Phase3/PPnewCommentsPH3.csv",
        "/content/drive/MyDrive/Colab Files/TM-Project/Phase3/PPnewAnswersPH3.csv",
    )
)

In [None]:
# Load the positive / negative word lexicons (one entry per line).
# errors='ignore' silently drops bytes that cannot be decoded instead of raising.
with open('/content/drive/MyDrive/Colab Files/TM-Project/Phase3/positive-words.txt', errors='ignore') as f:
    positives1 = f.readlines()
with open('/content/drive/MyDrive/Colab Files/TM-Project/Phase3/negative-words.txt', errors='ignore') as f:
    negatives1 = f.readlines()
# Remove only the line terminator. Slicing with [:-1] also cut the last
# character off a final line that has no trailing newline, corrupting that word.
negatives = [line.rstrip('\n') for line in negatives1]
positives = [line.rstrip('\n') for line in positives1]

#PreProcessing Datasets

In [None]:
# Vocabulary of known English words; tokens outside it are discarded later
# unless they are negation markers or "thanks" variants.
words = set(nltk.corpus.words.words())
stop_words = set(stopwords.words('english'))
# Negation markers, with and without the apostrophe.
# Fixes: "didnt" used to be written " didnt" (leading space) and therefore never
# matched a token; duplicated entries (weren't/werent/aren't/arent) were removed
# because code that iterates this list to count hits counted them twice.
negativeMakers = [
  "don't", "dont", "isn't", "isnt", "wouldn't", "wouldnt", "couldn't", "couldnt",
  "weren't", "werent", "doesn't", "doesnt", "aren't", "arent", "didn't", "didnt",
  "shan't", "shant", "hadn't", "hadnt", "won't", "wont", "mustn't", "mustnt",
  "not", "needn't", "neednt", "mightn't", "mightnt", "no", "wasn't", "wasnt",
  "nor", "neither",
]
thanks = ['thanks', 'thank', 'tanks', 'tnx']
# Words that must survive stop-word removal because later features look for them.
dontRemove = negativeMakers + thanks + ["but"]
# Effective stop-word list: NLTK's English stop words minus the protected words.
stopWords = [w for w in stop_words if w not in dontRemove]

##Comments

In [None]:
# Tokenise every comment on whitespace and keep a token only if it is not a
# stop word and is a dictionary word, a negation marker or a "thanks" variant.
# ppComments holds one token list per comment row; commentsWordsList is the
# flat list of every kept token.
ppComments = []
commentsWordsList = []
for rawText in comments['PPText'].to_numpy():
  if type(rawText) is float:
    # A float cell is treated as "no text" (presumably NaN for a missing value).
    ppComments.append([])
    continue
  keptTokens = [
    token for token in rawText.split()
    if token not in stopWords
    and (token in words or token in negativeMakers or token in thanks)
  ]
  ppComments.append(keptTokens)
  commentsWordsList.extend(keptTokens)

commentsWordsCount = len(commentsWordsList)

##Answers

In [None]:
# Tokenise every answer on whitespace and keep a token only if it is not a
# stop word and is a dictionary word, a negation marker or a "thanks" variant.
# ppAnswers holds one token list per answer row; answersWordsList is the flat
# list of every kept token.
ppAnswers = []
answersWordsList = []
for rawText in answers['PPText'].to_numpy():
  if type(rawText) is float:
    # A float cell is treated as "no text" (presumably NaN for a missing value).
    ppAnswers.append([])
    continue
  keptTokens = [
    token for token in rawText.split()
    if token not in stopWords
    and (token in words or token in negativeMakers or token in thanks)
  ]
  ppAnswers.append(keptTokens)
  answersWordsList.extend(keptTokens)

answersWordsCount = len(answersWordsList)

##Questions

In [None]:
# Tokenise every question on whitespace and keep a token only if it is not a
# stop word and is a dictionary word, a negation marker or a "thanks" variant.
# ppQuestions holds one token list per question row; questionsWordsList is the
# flat list of every kept token.
ppQuestions = []
questionsWordsList = []
for rawText in questions['PPText'].to_numpy():
  if type(rawText) is float:
    # A float cell is treated as "no text" (presumably NaN for a missing value).
    ppQuestions.append([])
    continue
  keptTokens = [
    token for token in rawText.split()
    if token not in stopWords
    and (token in words or token in negativeMakers or token in thanks)
  ]
  ppQuestions.append(keptTokens)
  questionsWordsList.extend(keptTokens)

questionsWordsCount = len(questionsWordsList)

#Functions

In [None]:
def makeDiscrete(inp, n_bins=3, strategy='kmeans'):
  """Bin a one-dimensional sequence of numbers into ordinal bucket indices.

  Parameters
  ----------
  inp : sequence of numbers
      Values to discretise; wrapped in a single-column DataFrame before fitting.
  n_bins : int, default 3
      Number of bins passed to KBinsDiscretizer.
  strategy : {'uniform', 'quantile', 'kmeans'}, default 'kmeans'
      Binning strategy passed to KBinsDiscretizer.

  Returns
  -------
  list
      One bin index per input value, in input order, taken from the single
      column of the ordinal-encoded output.
  """
  frame = pd.DataFrame({"inp": inp})
  discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
  binned = discretizer.fit_transform(frame)
  # fit_transform yields one row per value; unwrap the single column.
  return [row[0] for row in binned]

#Feature Extraction

##Number of answers

In [None]:
# For each answer row, record how many answer rows share its parent question id.
answersQuestionID = answers['ParentId(QuestionId)'].to_numpy()
unique, counts = np.unique(answersQuestionID, return_counts=True)
countsdict = {questionId: total for questionId, total in zip(unique, counts)}
nAnswers = [countsdict[questionId] for questionId in answersQuestionID]

##Words count

###Answers

In [None]:
# Number of kept tokens in each answer, aligned with ppAnswers.
wCountAnswers = [len(tokens) for tokens in ppAnswers]

In [None]:
# Feature column for the answer word count. The raw counts are used as-is;
# the commented-out line is the alternative that bins them with makeDiscrete.
#wCountA = makeDiscrete(wCountAnswers)
wCountA = wCountAnswers

###Questions

In [None]:
# Word count of the matching question, one entry per answer (aligned with ppAnswers).
# The question row is resolved through a one-time Id -> row-label map instead of
# re-filtering the whole `questions` frame for every answer.
questionRowById = {}
for rowLabel, questionId in zip(questions.index.tolist(), questions['Id'].tolist()):
  # setdefault keeps the first row for a repeated Id, like the `[0]` taken before.
  questionRowById.setdefault(questionId, rowLabel)
# NOTE(review): column 1 of `answers` is assumed to hold the parent question Id — confirm.
parentIds = answers.iloc[:len(ppAnswers), 1].tolist()
# The row label is used as a position in ppQuestions, as in the original lookup.
wCountQuestions = [len(ppQuestions[questionRowById[parentId]]) for parentId in parentIds]

In [None]:
# Feature column for the question word count. The raw counts are used as-is;
# the commented-out line is the alternative that bins them with makeDiscrete.
#wCountQ = makeDiscrete(wCountQuestions)
wCountQ = wCountQuestions

###Comments

In [None]:
# Total number of kept tokens across all comments attached to each answer
# (0 when an answer has no comments). Comment row labels are used as positions
# in ppComments.
wCountComments = []
for answerId in answers['Id'].to_numpy():
  matchingRows = comments.loc[comments['PostId(AnswerId)'] == answerId].index.tolist()
  wCountComments.append(sum(len(ppComments[row]) for row in matchingRows))

In [None]:
# Feature column for the per-answer comment word count. The raw counts are used
# as-is; the commented-out line is the alternative that bins them with makeDiscrete.
#wCountC = makeDiscrete(wCountComments)
wCountC = wCountComments

##Comments count

In [None]:
# Number of comments attached to each answer, aligned with the rows of `answers`.
# One Counter pass over the comment table replaces filtering the whole frame
# once per answer; ids with no comments read as 0 from the Counter.
commentsPerAnswer = Counter(comments['PostId(AnswerId)'].tolist())
nComments = [commentsPerAnswer[answerId] for answerId in answers['Id'].tolist()]

##Sentiment

In [None]:
# Sentiment labels were produced offline with Senti4SD
# (https://github.com/collab-uniba/Senti4SD) and are read from the CSV files in `path`.
# Encoding: "positive" -> 0, "negative" -> 1, anything else -> 2.
# NOTE(review): os.listdir returns entries in arbitrary order, so with more than
# one prediction file the rows of `sentiment` may not line up with `answers` — confirm.
path = "/content/drive/MyDrive/Colab Files/TM-Project/Phase3/Predict(Answer)"
sentimentCode = {"positive": 0, "negative": 1}
sentiment = []
for fileName in os.listdir(path):
  predictions = pd.read_csv(f"{path}/{fileName}")
  for rowNumber in range(len(predictions)):
    # Rows are looked up by their 't<number>' key rather than by position.
    rowMatch = predictions['Row'] == 't' + str(rowNumber)
    predictedLabel = predictions.loc[rowMatch, 'Predicted'].iloc[0]
    sentiment.append(sentimentCode.get(predictedLabel, 2))

##Thanks Included

In [None]:
# Binary flag per answer: 1 if any comment on the answer contains one of the
# "thanks" variants, otherwise 0.
thanksIncluded = np.zeros(len(ppAnswers))
answersIdList = answers['Id'].to_numpy()
for position in range(len(ppAnswers)):
  matchingRows = comments.loc[comments['PostId(AnswerId)'] == answersIdList[position]].index.tolist()
  # any() stops at the first hit, like the flag/break pair it replaces.
  if any(variant in ppComments[row] for row in matchingRows for variant in thanks):
    thanksIncluded[position] = 1

##But Included

In [None]:
# Binary flag per answer: 1 if any comment on the answer contains the token
# 'but', otherwise 0.
butIncluded = np.zeros(len(ppAnswers))
answersIdList = answers['Id'].to_numpy()
for position in range(len(ppAnswers)):
  matchingRows = comments.loc[comments['PostId(AnswerId)'] == answersIdList[position]].index.tolist()
  if any('but' in ppComments[row] for row in matchingRows):
    butIncluded[position] = 1

##Negative maker words count

In [None]:
# Negation density per answer: for every comment on the answer, count each entry
# of negativeMakers that occurs in it, then divide the total by the answer's
# comment word count (0 when there are no comment words).
nNegative = []
answersIdList = answers['Id'].to_numpy()
for position in range(len(ppAnswers)):
  matchingRows = comments.loc[comments['PostId(AnswerId)'] == answersIdList[position]].index.tolist()
  hits = sum(
    1
    for row in matchingRows
    for marker in negativeMakers
    if marker in ppComments[row]
  )
  totalWords = wCountComments[position]
  nNegative.append(hits / totalWords if totalWords != 0 else 0)

In [None]:
# Feature column for the negation-marker ratio. The raw ratios are used as-is;
# the commented-out line is the alternative that bins them with makeDiscrete.
#nMakers = makeDiscrete(nNegative)
nMakers = nNegative

##P(y)/Words count

###Comments

In [None]:
# Frequency of each lexicon word among the kept comment words, expressed as
# occurrences per million words and rounded to 2 decimals. Only lexicon words
# that actually occur are kept; existingPositives[i] pairs with pyPositive[i]
# (likewise for the negatives).
# NOTE: the same four names are reassigned by the answers frequency cell further
# down, so cell execution order matters.
# A single Counter pass replaces calling list.count once per lexicon word.
commentWordCounts = Counter(commentsWordsList)
pyPositive = []
pyNegative = []
existingPositives = []
existingNegatives = []
for y in positives:
  count = commentWordCounts[y]
  if count != 0:
    pyPositive.append( round((count / commentsWordsCount) * 1000000 , 2) )
    existingPositives.append(y)
for y in negatives:
  count = commentWordCounts[y]
  if count != 0:
    pyNegative.append( round((count / commentsWordsCount) * 1000000 , 2) )
    existingNegatives.append(y)


In [None]:
# Lexicon polarity score of the comments on each answer: add the weight of every
# positive lexicon word present in a comment, subtract the weight of every
# negative one, then divide by the answer's comment word count. A zero score is
# stored as 0 without dividing.
pyComments = []
answersIdList = answers['Id'].to_numpy()
for position, answerId in enumerate(answersIdList):
  matchingRows = comments.loc[comments['PostId(AnswerId)'] == answerId].index.tolist()
  score = 0
  for row in matchingRows:
    tokens = ppComments[row]
    if not tokens:
      continue
    for word, weight in zip(existingPositives, pyPositive):
      if word in tokens:
        score += weight
    for word, weight in zip(existingNegatives, pyNegative):
      if word in tokens:
        score -= weight
  pyComments.append(score / wCountComments[position] if score != 0 else 0)

In [None]:
# Author's note (paraphrased): if the makeDiscrete step below stays disabled, the
# ratio appended in pyComments.append(count/ wCountComments[a]) should be rounded instead.
# The raw scores are currently used as-is.
#pyC = makeDiscrete(pyComments)
pyC = pyComments

###Answers

In [None]:
# Frequency of each lexicon word among the kept answer words, expressed as
# occurrences per million words and rounded to 2 decimals. Only lexicon words
# that actually occur are kept; existingPositives[i] pairs with pyPositive[i]
# (likewise for the negatives).
# NOTE: this reassigns the names used by the comments frequency cell above, so
# cell execution order matters.
# A single Counter pass replaces calling list.count once per lexicon word.
answerWordCounts = Counter(answersWordsList)
pyPositive = []
pyNegative = []
existingPositives = []
existingNegatives = []
for y in positives:
  count = answerWordCounts[y]
  if count != 0:
    pyPositive.append( round((count / answersWordsCount) * 1000000 , 2) )
    existingPositives.append(y)
for y in negatives:
  count = answerWordCounts[y]
  if count != 0:
    pyNegative.append( round((count / answersWordsCount) * 1000000 , 2) )
    existingNegatives.append(y)

In [None]:
# Lexicon polarity score of each answer: add the weight of every positive lexicon
# word present in the answer, subtract the weight of every negative one, then
# divide by the answer's word count. A zero score is stored as 0 without dividing.
pyAnswers = []
for a in range(len(ppAnswers)):
  count = 0
  theAnswer = ppAnswers[a]
  for word, weight in zip(existingPositives, pyPositive):
    if word in theAnswer:
      count += weight
  # Bug fix: the negative loop used to index pyNegative with `pos`, the stale
  # index left over from the positive loop, so it subtracted the wrong weight
  # (or raised IndexError). Each negative word is now paired with its own weight.
  for word, weight in zip(existingNegatives, pyNegative):
    if word in theAnswer:
      count -= weight
  if count != 0:
    pyAnswers.append(count/ wCountAnswers[a])
  else:
    pyAnswers.append(0)

In [None]:
# Author's note (paraphrased): if the makeDiscrete step below stays disabled, the
# appended ratio should be rounded instead. The original note named
# pyComments.append(count/ wCountComments[a]); in this section it presumably
# refers to pyAnswers.append(count/ wCountAnswers[a]) — TODO confirm.
# The raw scores are currently used as-is.
#pyA = makeDiscrete(pyAnswers)
pyA = pyAnswers

##Wordnet Similarity

###Questions and Answers

In [None]:
# Map every distinct kept token (questions, answers and comments) to the first
# WordNet synset returned for it, or to the integer 0 when there is none.
vocabulary = np.unique(np.concatenate((questionsWordsList, answersWordsList, commentsWordsList)))
allSynsets = {}
for token in vocabulary:
  candidates = wn.synsets(token)
  allSynsets[token] = candidates[0] if len(candidates) != 0 else 0

In [None]:
# Question/answer similarity: sum wup_similarity over every (question word,
# answer word) pair whose words both have a synset, then divide by the combined
# word count of the question and the answer.
# simDict caches pair scores; the answer/comment similarity cell reuses it.
simDict = {}
answersParentIdList = answers['ParentId(QuestionId)'].to_numpy()
questionAnswerSim = []
for a in range(len(answersParentIdList)):
  theQuestion = ppQuestions[questions.loc[ questions['Id'] == answersParentIdList[a]].index.tolist()[0]]
  theAnswer = ppAnswers[a]
  wordsCount = len(theQuestion) + len(theAnswer)
  wup_sim = 0
  for qw in theQuestion:
    for aw in theAnswer:
      # Tuple keys instead of qw+aw: concatenated strings let different pairs
      # share one cache entry ("ab"+"c" == "a"+"bc").
      sim = simDict.get((qw, aw))
      if sim is None:
        first = allSynsets[qw]
        second = allSynsets[aw]
        # The integer 0 marks "no synset"; such pairs contribute nothing.
        if isinstance(first, int) or isinstance(second, int):
          continue
        sim = first.wup_similarity(second)
        if sim is None:
          sim = 0
        # Store both orders so either lookup direction hits the cache.
        simDict[(qw, aw)] = sim
        simDict[(aw, qw)] = sim
      wup_sim += sim
  # Guard: an empty question paired with an empty answer used to raise ZeroDivisionError.
  questionAnswerSim.append(wup_sim/wordsCount if wordsCount != 0 else 0)

In [None]:
# Author's note (paraphrased): if the makeDiscrete step below stays disabled, the
# similarity values should be rounded instead. The raw values are currently used as-is.
#QAsim = makeDiscrete(questionAnswerSim)
QAsim = questionAnswerSim

###Answers and Comments

In [None]:
# Answer/comment similarity: sum wup_similarity over every (comment word, answer
# word) pair whose words both have a synset, across all comments on the answer,
# then divide by the answer's word count plus its comment word count.
# Relies on simDict (pair-score cache) already existing from the question/answer cell.
answersIdList = answers['Id'].to_numpy()
answerCommentSim = []
for a in range(len(answersIdList)):
  CommentsIndex = comments.loc[ comments['PostId(AnswerId)'] == answersIdList[a]].index.tolist()
  theAnswer = ppAnswers[a]
  wordsCount = len(theAnswer) + wCountComments[a]
  wup_sim = 0
  for i in CommentsIndex:
    theComment = ppComments[i]
    for qw in theComment:
      for aw in theAnswer:
        # Tuple keys instead of qw+aw: concatenated strings let different pairs
        # share one cache entry ("ab"+"c" == "a"+"bc").
        sim = simDict.get((qw, aw))
        if sim is None:
          first = allSynsets[qw]
          second = allSynsets[aw]
          # The integer 0 marks "no synset"; such pairs contribute nothing.
          if isinstance(first, int) or isinstance(second, int):
            continue
          sim = first.wup_similarity(second)
          if sim is None:
            sim = 0
          # Store both orders so either lookup direction hits the cache.
          simDict[(qw, aw)] = sim
          simDict[(aw, qw)] = sim
        wup_sim += sim
  # Guard: an empty answer with no comment words used to raise ZeroDivisionError.
  answerCommentSim.append(wup_sim/wordsCount if wordsCount != 0 else 0)

In [None]:
# Author's note (paraphrased): if the makeDiscrete step below stays disabled, the
# similarity values should be rounded instead. The raw values are currently used as-is.
#ACsim = makeDiscrete(answerCommentSim)
ACsim = answerCommentSim

#Labeling

In [None]:
# Target label per answer: 1 if the answer's Id appears in the questions'
# AcceptedAnswerId column, otherwise 0.
acceptedsId = questions['AcceptedAnswerId'].to_numpy()
answersId = answers['Id'].to_numpy()
# A set gives constant-time membership; `a in ndarray` scanned the whole array
# once per answer. Equal int/float ids still match (5 == 5.0 and they hash alike).
acceptedLookup = set(acceptedsId.tolist())
label = [1 if answerId in acceptedLookup else 0 for answerId in answersId.tolist()]

#Machine learning

In [None]:
#dataset = pd.DataFrame({'NoA':nAnswers, 'WCA': wCountA, 'WCQ': wCountQ, 'WCC': wCountC, 'CC': nComments,
                        #'tnx': thanksIncluded, "Senti": sentiment, 'but': butIncluded, 'NMC': nMakers,
                        #'PYC': pyC, 'PYA': pyA, 'SQA': QAsim, 'SAC': ACsim, 'Label': label})
# "mamuli" (ordinary/default) setup with uniform binning: 69.5 with AdaBoost
#dataset = pd.DataFrame({'NoA':nAnswers, 'CC': nComments,
                        #'tnx': thanksIncluded, 'but': butIncluded,
                        #'Label': label})
# quantile binning: 70% with AdaBoost, Random Forest and Decision Tree
#dataset = pd.DataFrame({'NoA':nAnswers, 'WCA': wCountA, 'CC': nComments,
                        #'tnx': thanksIncluded,'but': butIncluded, 'NMC': nMakers,
                        #'Label': label})

##Feature Selection

In [None]:
# Assemble the feature table: one row per answer, one column per extracted
# feature, with the target 'Label' as the last column.
featureColumns = {
  'NoA': nAnswers,
  'WCA': wCountA,
  'WCQ': wCountQ,
  'WCC': wCountC,
  'CC': nComments,
  'tnx': thanksIncluded,
  "Senti": sentiment,
  'but': butIncluded,
  'NMC': nMakers,
  'PYC': pyC,
  'PYA': pyA,
  'SQA': QAsim,
  'SAC': ACsim,
  'Label': label,
}
dataset = pd.DataFrame(featureColumns)
# Everything except the last column is a predictor; the last column is the target.
features, labels = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Save the assembled feature table to Drive so later sessions can reload it
# without recomputing the features; index=False omits the row index from the file.
dataset.to_csv("/content/drive/MyDrive/Colab Files/TM-Project/Phase3/ds.csv", index=False)

In [None]:
# Reload the saved feature table; `ds` and `dataset` both refer to the loaded frame.
dataset = ds = pd.read_csv("/content/drive/MyDrive/Colab Files/TM-Project/Phase3/ds.csv")
# Optional feature subset (disabled):
#dataset = ds[['NoA', 'WCA', 'CC', 'tnx', 'PYC', 'NMC', 'Label']]
# Everything except the last column is a predictor; the last column is the target.
features, labels = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
#dataset = pd.DataFrame({'NoA':nAnswers, 'WCA': wCountA, 'CC': nComments,
                        #'tnx': thanksIncluded, 'PYC': pyC, 'NMC': nMakers,
                        #'Label': label})
#features = dataset.iloc[:, :-1]
#labels = dataset.iloc[:, -1]

##KNN

In [None]:
# k-nearest-neighbours classifier (k = 8, Minkowski metric with p = 2).
knn = KNeighborsClassifier(
  n_neighbors=8,
  p=2,
  metric='minkowski',
  metric_params=None,
  algorithm='auto',
)
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(knn, features, labels, cv=10, n_jobs=-1)
knnReport = metrics.classification_report(labels, predicted)
print(knnReport)

              precision    recall  f1-score   support

           0       0.64      0.81      0.72     21813
           1       0.58      0.37      0.45     15462

    accuracy                           0.63     37275
   macro avg       0.61      0.59      0.59     37275
weighted avg       0.62      0.63      0.61     37275



##Random Forest

In [None]:
# Random forest with 100 entropy-criterion trees and a fixed random_state.
Random_Forest_model = RandomForestClassifier(
  n_estimators=100,
  criterion="entropy",
  random_state=0,
)
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(Random_Forest_model, features, labels, cv=10, n_jobs=-1)
randomForestReport = metrics.classification_report(labels, predicted)
print(randomForestReport)

              precision    recall  f1-score   support

           0       0.72      0.78      0.75     21813
           1       0.65      0.57      0.61     15462

    accuracy                           0.69     37275
   macro avg       0.68      0.68      0.68     37275
weighted avg       0.69      0.69      0.69     37275



##Ada Boost

In [None]:
# AdaBoost ensemble with 100 estimators and a fixed random_state.
AdaBoost = AdaBoostClassifier(
  n_estimators=100,
  random_state=0,
)
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(AdaBoost, features, labels, cv=10, n_jobs=-1)
adaBoostReport = metrics.classification_report(labels, predicted)
print(adaBoostReport)

              precision    recall  f1-score   support

           0       0.72      0.82      0.77     21813
           1       0.68      0.55      0.61     15462

    accuracy                           0.71     37275
   macro avg       0.70      0.68      0.69     37275
weighted avg       0.70      0.71      0.70     37275



##Decision Tree

In [None]:
# Single decision tree using the entropy criterion and a fixed random_state.
DT = DecisionTreeClassifier(
  criterion='entropy',
  random_state=0,
)
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(DT, features, labels, cv=10, n_jobs=-1)
decisionTreeReport = metrics.classification_report(labels, predicted)
print(decisionTreeReport)

              precision    recall  f1-score   support

           0       0.67      0.67      0.67     21813
           1       0.54      0.55      0.54     15462

    accuracy                           0.62     37275
   macro avg       0.61      0.61      0.61     37275
weighted avg       0.62      0.62      0.62     37275



##SVM

In [None]:
# Support vector classifier with an RBF kernel; all other settings are defaults.
svm = SVC(
  kernel='rbf',
)
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(svm, features, labels, cv=10, n_jobs=-1)
svmReport = metrics.classification_report(labels, predicted)
print(svmReport)

              precision    recall  f1-score   support

           0       0.67      0.79      0.72     21813
           1       0.60      0.46      0.52     15462

    accuracy                           0.65     37275
   macro avg       0.64      0.62      0.62     37275
weighted avg       0.64      0.65      0.64     37275



##Naive Bayes Classifier

In [None]:
# Gaussian naive Bayes classifier with default settings.
Gaussian = GaussianNB()
# Predictions from 10-fold cross-validation, then a per-class report.
predicted = cross_val_predict(Gaussian, features, labels, cv=10, n_jobs=-1)
naiveBayesReport = metrics.classification_report(labels, predicted)
print(naiveBayesReport)

              precision    recall  f1-score   support

           0       0.68      0.81      0.74     21813
           1       0.64      0.48      0.54     15462

    accuracy                           0.67     37275
   macro avg       0.66      0.64      0.64     37275
weighted avg       0.66      0.67      0.66     37275

