In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.stem import SnowballStemmer
from sklearn.metrics import log_loss

WNL = WordNetLemmatizer()
STOP_WORDS = stopwords.words("english")
REMOVE_STOPWORDS=True
STEM_WORDS=True
LEM_INSTEAD_OF_STEM=True

def cutter(word):
    if len(word) < 4:
        return word
    return WNL.lemmatize(WNL.lemmatize(word, "n"), "v")

def preprocess(string):
    # Convert words to lower case and clean the text
    string = string.lower().replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'") \
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not") \
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is") \
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are") \
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own") \
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ") \
        .replace("€", " euro ").replace("'ll", " will").replace("=", " equal ").replace("+", " plus ")
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)

    # Remove punctuation from text
    string = re.sub('[“”\(\'…\)\!\^\"\.;:,\-\?？\{\}\[\]\\/\*@]', ' ', string)

    # Optionally, remove stop words
    if REMOVE_STOPWORDS:
        string = string.split()
        string = [w for w in string if not w in STOP_WORDS]
        string = " ".join(string)

    # Optionally, shorten words to their stems or lemmatize
    if STEM_WORDS:
        if LEM_INSTEAD_OF_STEM:
          string = ' '.join([cutter(w) for w in string.split()])
        else:
          string = string.split()
          stemmer = SnowballStemmer('english')
          stemmed_words = [stemmer.stem(word) for word in string]
          string = " ".join(stemmed_words)

    return string

print("Loading data...")
train = pd.read_csv("train_original.csv")
unprocessed_qid1=train["question1"].copy()
unprocessed_qid2=train["question2"].copy()

print("Clearing text...")
train["question1"] = train["question1"].fillna("").apply(preprocess)
train["question2"] = train["question2"].fillna("").apply(preprocess)

Loading data...
Clearing text...


In [0]:
import random
rn=random.randint(0, 100)
print("For example, the question \n",unprocessed_qid1[rn], "\nbecame \n", train["question1"][rn], "",sep='')

For example, the question 
Can I make 50,000 a month by day trading?
became 
make 50k month day trade


In [0]:
#FOR PART A

def common_words_ratio(q1,q2):
  q1s = q1.split()
  q2s = q2.split()
  
  avg_length = (len(q1s)+len(q2s))/2
  if avg_length==0:
    return 0
  common_count=0
  for word in q1s:
    for wordy in q2s:
      if word==wordy:
        common_count=common_count+1
        break
        
  return common_count/avg_length

print("Making list of common words between 2 questions...")
common_words=[]
for index, row in train.iterrows():
  common_words.append(common_words_ratio(row["question1"], row["question2"]))

print("Training model...")
from sklearn.model_selection import train_test_split
common_words = np.array(common_words).reshape(-1, 1)
labels = np.array(train["is_duplicate"])
X_train, X_test, y_train, y_test = train_test_split(common_words, labels, test_size=0.2, random_state=0)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=30, random_state=21)
model.fit(X_train, y_train)
print("Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))

pred = model.predict_proba(X_test)
metric_1 = log_loss(y_test, pred)
print ("Logloss on test set: {:.3f}".format(metric_1))

Making list of common words between 2 questions...
Training model...
Accuracy on test set: 0.679
Logloss on test set: 0.536


In [0]:
#FOR PART B

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus=pd.concat([train["question1"],train["question2"]])
cv = CountVectorizer().fit(corpus)

print("Calculating cosine similarities between questions...")
cosine_similarities=[]
size=train.shape[0]
for j in range(size):
    a = cv.transform([train.iloc[j].question1])
    b = cv.transform([train.iloc[j].question2])
    cosine_similarities.append(cosine_similarity(a,b).ravel()[0])
    
border=0.50
print("Calculating accuracy for border",border,"...")
#Labels are only used to find accuracy
correct_predictions=0
for j in range(size):
  if (cosine_similarities[j]>=border and train.iloc[j].is_duplicate==1) or (cosine_similarities[j]<border and train.iloc[j].is_duplicate==0):
    correct_predictions=correct_predictions+1
print("For border",border,"accuracy is: {:.4f}".format(correct_predictions/size))

Calculating cosine similarities between questions...
Calculating accuracy for border 0.5 ...
For border 0.5 accuracy is: 0.6348


In [0]:
#FOR PART C

print("Creating list of unique questions...")
list_of_unique_questions = [0] * 537934 #537933 unique questions

n_of_pairs=len(train)
for i in range(n_of_pairs):
  
  if list_of_unique_questions[train["qid1"][i]]==0:
    list_of_unique_questions[train["qid1"][i]]=train["question1"][i]
    
  if list_of_unique_questions[train["qid2"][i]]==0:
    list_of_unique_questions[train["qid2"][i]]=train["question2"][i]
    
print("Creating list of unique unprocessed questions...")
list_of_unique_unprocessed_questions = [0] * 537934 #537933 unique questions

n_of_pairs=len(train)
for i in range(n_of_pairs):
  
  if list_of_unique_unprocessed_questions[train["qid1"][i]]==0:
    list_of_unique_unprocessed_questions[train["qid1"][i]]=unprocessed_qid1[i]
    
  if list_of_unique_unprocessed_questions[train["qid2"][i]]==0:
    list_of_unique_unprocessed_questions[train["qid2"][i]]=unprocessed_qid2[i]
    
print("Creating dictionary of words (inverted index)...")
inverted = {}
for i in range(1,len(list_of_unique_questions)):
  words = list_of_unique_questions[i].split()    
  for word in words:
    inverted.setdefault(word, [])
    if i not in inverted[word]:
      inverted[word].append(i)

#Find how many times the most common word appears
max=0
for word,documents in inverted.items():
  count=len(documents)
  if count>max:
    max=count
    
#Normalization of every word, common words count less!
for word,documents in inverted.items():
  count=len(documents)
  zi=(count-1)/(max)
  documents.append(1-zi)

Creating list of unique questions...
Creating list of unique unprocessed questions...
Creating dictionary of words (inverted index)...


In [0]:
import datetime
a = datetime.datetime.now()

#Find all documents that contain at least one word from the question
testing_phrase="how to be a better programmer?"
print(testing_phrase)
testing_phrase=preprocess(testing_phrase)
words = testing_phrase.split()
similar_docs=set()

for word in words:
  if word in inverted:
    for doc in inverted[word][:-1]:
      similar_docs.add(doc)

#Find the most relevant documents, depending on how common a word is and how long is the question
question_length=len(testing_phrase)
similar_docs_points=[]
similar_docs_texts=[]
for doc in similar_docs:
  words = set(list_of_unique_questions[doc].split())
  points=0
  for word in words:
    if word in testing_phrase:
      points+=inverted[word][-1]*(1/len(words))
  similar_docs_points.append(points)
  similar_docs_texts.append(list_of_unique_unprocessed_questions[doc])

number_of_elements = 10
Z = [x for _,x in sorted(zip(similar_docs_points,similar_docs_texts))]
Z.reverse()

b = datetime.datetime.now()
c = b - a
print("Found these questions in ", c.seconds, ".",c.microseconds," seconds:\n", sep='')
for x in Z[:number_of_elements]:
   print(x)

how to be a better programmer?
Found these questions in 0.56920 seconds:

What is a Programmer?
How can I be a better programmer?
What should I do to be better than better?
How can I better myself?
How do top programmers programm?
What do programmers program all the time?
Is Python better than R?
How can I be a better programmer / developer?
Which is better, B.A or B.Sc?
Which Programming language is better?
