In [1]:
%fs ls s3a://sa-data-lake-dev/external/quora-question-pairs/

path,name,size
s3a://sa-data-lake-dev/external/quora-question-pairs/sample_submission.csv,sample_submission.csv,22346871
s3a://sa-data-lake-dev/external/quora-question-pairs/test.csv,test.csv,314015127
s3a://sa-data-lake-dev/external/quora-question-pairs/train.csv,train.csv,63399110


In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *

In [3]:
path_train = 's3a://sa-data-lake-dev/external/quora-question-pairs/train.csv'
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('qid1', IntegerType(), True),
    StructField('qid2', IntegerType(), True),
    StructField('question1', StringType(), True),
    StructField('question2', StringType(), True),
    StructField('is_duplicate', IntegerType(), True)
  ])
dataset_train = spark.read.option('header','true').option('escape', '"').option('mode','DROPMALFORMED').schema(schema).csv(path_train)

In [4]:
# first look at the data
display(dataset_train)

id,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0
5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1
6,13,14,Should I buy tiago?,What keeps childern active and far from phone and video games?,0
7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,19,20,Motorola (company): Can I hack my Charter Motorolla DCX3400?,How do I hack Motorola DCX3400 for free internet?,0


In [5]:
path_test = 's3a://sa-data-lake-dev/external/quora-question-pairs/test.csv'
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('qid1', IntegerType(), True),
    StructField('qid2', IntegerType(), True),
    StructField('question1', StringType(), True),
    StructField('question2', StringType(), True),
    StructField('is_duplicate', IntegerType(), True)
  ])
dataset_test = spark.read.option('header','true').option('escape', '"').option('mode','DROPMALFORMED').csv(path_test)

In [6]:
# test data does not have question ids
display(dataset_test)

test_id,question1,question2
0,How does the Surface Pro himself 4 compare with iPad Pro?,Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?
1,Should I have a hair transplant at age 24? How much would it cost?,How much cost does hair transplant require?
2,What but is the best way to send money from China to the US?,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?
5,How are the two wheeler insurance from Bharti Axa insurance?,I admire I am considering of buying insurance from them
6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?
7,"By scrapping the 500 and 1000 rupee notes, how is RBI planning to fight against issue black money?",How will the recent move to declare 500 and 1000 denomination lewin illegal will curb black money?
8,What are the how best books of all time?,What are some of the military history books of all time?
9,"After 12th years old boy and I had sex with a 12 years old girl, with her consent. Is there anything wrong?",Can a 14 old guy date a 12 year old girl?


In [7]:
display(
  dataset_train.groupby('is_duplicate').agg(count('*').alias('total'))
)

is_duplicate,total
1,149263
0,255016


In [8]:
dataset_train_questions = dataset_train.select(col('question1').alias('question')).union(dataset_train.select(col('question2').alias('question')))

In [9]:
display(dataset_train_questions)

question
What is the step by step guide to invest in share market in india?
What is the story of Kohinoor (Koh-i-Noor) Diamond?
How can I increase the speed of my internet connection while using a VPN?
Why am I mentally very lonely? How can I solve it?
"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?"
Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Should I buy tiago?
How can I be a good geologist?
When do you use シ instead of し?
Motorola (company): Can I hack my Charter Motorolla DCX3400?


In [10]:
list(dataset_train_questions)

### Groupby WH questions 
what
why 
who 
which 
where
how
when

In [12]:
from pyspark.sql.functions import col, lower, split

In [13]:
lower_dataset_train_questions = dataset_train_questions.select(lower(what_questions.question).alias('question')).dropDuplicates()

In [14]:
display(lower_dataset_train_questions.agg(count('*').alias('total')))

total
537179


In [15]:
display(lower_dataset_train_questions)

question
how do i find a startup accelerator?
what is the future for ios developers?
hypothetical scenario: our actual credit card system & banking never existed before. how would you implement a digital currency on the internet today?
how can we solve the issue of kashmir?
"in a turbine, why is the final enthalpy (of the working substance) in an actual process greater than that of an isentropic process?"
how does banning 500 & 1000 rupee notes solve black money problem?
which is the best movie download site?
what are the chances that the original members of guns and roses will ever reunite?
how are yif placements?
what are the east way to hack whatsapp?


#### what

In [17]:
what_questions = lower_dataset_train_questions.select('question').where("question like 'what%'").dropDuplicates()
display(what_questions)


question
what is the future for ios developers?
what are the chances that the original members of guns and roses will ever reunite?
what are the east way to hack whatsapp?
"what would happen to tupac shakur, criminally, if he is still alive (faked his death) and he came back into the public?"
what is the difference between o- and o+?
what is the best way to find good questions on quora?
what is miscellaneous in an android phone's storage?
what kind of information is stored in a plane's black box?
what are some of the best books on computer science?
what are the different organ systems?


In [18]:
display(what_questions.agg(count('*').alias('total')))

total
185378


#### where

In [20]:
where_questions = lower_dataset_train_questions.select('question').where("question like 'where%'").dropDuplicates()
display(where_questions)


question
where is your source of motivation?
where in los angeles does a 32 year old single man meet women who are looking for serious relationships?
where can i watch dragon ball super episode 46 with english subtitles?
where can i find one of the original three pounds bowie knives?
where can i watch naruto shippuden 476?
where can i find co authors to write legal research papers?
where can i get a list of asian family offices?
where can i find best hotels in bhopal?
where can i buy a used bicycle online?
where can i found high level of craftsmanship for glassware items in melbourne?


In [21]:
display(where_questions.agg(count('*').alias('total')))

total
9060


#### when

In [23]:
when_questions = lower_dataset_train_questions.select('question').where("question like 'when%'").dropDuplicates()
display(when_questions)

question
"when people say mean and hurtful things to you when they are angry, do they really mean all those things?"
when is 5g coming to india?
"when writing journals, do you prefer paper, digital form or both?"
when did you first stopped caring about what others think of you?
"when japan was strong, japan committed atrocities to people in east asia and asean. why?"
when sperm sits inside the vagina does it have a smelly odor to it?
"when picturing aliens, what do they look and sound like to you? why do you think they would come here?"
when will android become a real-time operating system?
when is the best time to start a business?
when a moving car encounters a patch of ice the brakes are applied. why is it desirable to keep the wheels rolling on the ice without locking up?


In [24]:
display(when_questions.agg(count('*').alias('total')))

total
4735


#### why

In [26]:
why_questions = lower_dataset_train_questions.select('question').where("question like 'why%'").dropDuplicates()
display(why_questions)

question
why do i have a hard lump on my earlobe?
why is black friday called so?
why do some people prefer to watch precure to the manaphy movie?
why is superman the best superhero?
why should i visit israel as compared to any other country in the world?
why do people often draw eyes when doodling?
"why is oxygen molecule paramagnetic, although it does not contain any unpaired electron?"
why did industrialists support prohibition?
why does evolution tend toward increased order despite the second law of thermodynamics?
why is there so much feminism in countries like usa and australia?


In [27]:
display(why_questions.agg(count('*').alias('total')))

total
43503


#### which

In [29]:
which_questions = lower_dataset_train_questions.select('question').where("question like 'which%'").dropDuplicates()
display(which_questions)

question
which is the best movie download site?
which are the best places to learn spanish abroad?
which companies in india hire freshers in finance?
which book is better for learning core java?
which are multiplayer games for pc?
which are the hit female singer songs before 2007 ? i am looking for one song which i liked but don't know the song name it was a popular hit song
which is the best book for android development for beginners?
which is the best country to live in?
which hand should be used for masturbation?
which is the best online shopping site in tamil nadu?


In [30]:
display(which_questions.agg(count('*').alias('total')))

total
21631


#### who

In [32]:
who_questions = lower_dataset_train_questions.select('question').where("question like 'who%'").dropDuplicates()
display(who_questions)

question
who is your favourite impressionist painter and why?
who would win in a one-on-one fight between savage hulk and doomsday?
who funded rypple.com?
who are some celebrities who admit they worship satan or the devil?
who are the best hair transplantation surgeons in hyderabad?
who are the best hollywood actors?
who invented the english alphabet?
who do you think is the best singer that has ever set foot on this world?
who are some lesser known important historical figures of mauritania and what should people know about them?
who is the best james bond and what is his best bond film?


In [33]:
display(who_questions.agg(count('*').alias('total')))

total
10133


#### how

In [35]:
how_questions = lower_dataset_train_questions.select('question').where("question like 'how%'").dropDuplicates()
display(how_questions)

question
how do i find a startup accelerator?
how can we solve the issue of kashmir?
how does banning 500 & 1000 rupee notes solve black money problem?
how are yif placements?
how often does an atm make a mistake in receiving or dispensing cash?
how useful is it to learn programming on codecademy?
how can i learn better in school/ how can i get better grades in school?
how does packet switched network differs from circuit switched network?
how can i activate a verizon phone with a call?
how can i gain weight and height?


In [36]:
display(how_questions.agg(count('*').alias('total')))

total
113794


#### others

In [38]:
other_questions = lower_dataset_train_questions.select('question').where("question!= 'what%' and question!= 'where%' and question!= 'when%' and question!= 'which%' and question!= 'why%' and question!= 'who%' and question!= 'how%'" )

In [39]:
display(other_questions)

question
how do i find a startup accelerator?
what is the future for ios developers?
hypothetical scenario: our actual credit card system & banking never existed before. how would you implement a digital currency on the internet today?
how can we solve the issue of kashmir?
"in a turbine, why is the final enthalpy (of the working substance) in an actual process greater than that of an isentropic process?"
how does banning 500 & 1000 rupee notes solve black money problem?
which is the best movie download site?
what are the chances that the original members of guns and roses will ever reunite?
how are yif placements?
what are the east way to hack whatsapp?


In [40]:
dataset_train_questions_unique = dataset_train_questions.dropDuplicates()

In [41]:
display(
  dataset_train_questions_unique.agg(count('*').alias('total'))
)

total
537349


In [42]:
# regex tokenizer
regexTokenizer = RegexTokenizer(inputCol="question", outputCol="words", pattern="\\W")
regex_tokenized = regexTokenizer.transform(dataset_train_questions_unique)

In [43]:
display(regex_tokenized)

question,words
"Will a Blu Ray play on a regular DVD player? If so, how?","List(will, a, blu, ray, play, on, a, regular, dvd, player, if, so, how)"
"What is the best way to invest $500 legally so that I can get tangible profits over a relatively short period of time, say 6 months?","List(what, is, the, best, way, to, invest, 500, legally, so, that, i, can, get, tangible, profits, over, a, relatively, short, period, of, time, say, 6, months)"
How long does it take for a dead body to float to the surface after drowning ?,"List(how, long, does, it, take, for, a, dead, body, to, float, to, the, surface, after, drowning)"
Which one is better among KMC Manipal and KMC Mangalore?,"List(which, one, is, better, among, kmc, manipal, and, kmc, mangalore)"
What does @ mean as an emoticon?,"List(what, does, mean, as, an, emoticon)"
What is a good song to lyric prank your best friend?,"List(what, is, a, good, song, to, lyric, prank, your, best, friend)"
Which Marvel movies have not included Stan Lee cameos?,"List(which, marvel, movies, have, not, included, stan, lee, cameos)"
Why does temperature decrease when altitude increases?,"List(why, does, temperature, decrease, when, altitude, increases)"
Should I be afraid of my parents?,"List(should, i, be, afraid, of, my, parents)"
What is the best and quick way to lose weight?,"List(what, is, the, best, and, quick, way, to, lose, weight)"


In [44]:
regex_tokenized.count()

In [45]:
tokenizer = Tokenizer(inputCol="question", outputCol="words")
tokenized = tokenizer.transform(dataset_train_questions_unique)

In [46]:
display(tokenized)

question,words
"Will a Blu Ray play on a regular DVD player? If so, how?","List(will, a, blu, ray, play, on, a, regular, dvd, player?, if, so,, how?)"
"What is the best way to invest $500 legally so that I can get tangible profits over a relatively short period of time, say 6 months?","List(what, is, the, best, way, to, invest, $500, legally, so, that, i, can, get, tangible, profits, over, a, relatively, short, period, of, time,, say, 6, months?)"
How long does it take for a dead body to float to the surface after drowning ?,"List(how, long, does, it, take, for, a, dead, body, to, float, to, the, surface, after, drowning, ?)"
Which one is better among KMC Manipal and KMC Mangalore?,"List(which, one, is, better, among, kmc, manipal, and, kmc, mangalore?)"
What does @ mean as an emoticon?,"List(what, does, @, mean, as, an, emoticon?)"
What is a good song to lyric prank your best friend?,"List(what, is, a, good, song, to, lyric, prank, your, best, friend?)"
Which Marvel movies have not included Stan Lee cameos?,"List(which, marvel, movies, have, not, included, stan, lee, cameos?)"
Why does temperature decrease when altitude increases?,"List(why, does, temperature, decrease, when, altitude, increases?)"
Should I be afraid of my parents?,"List(should, i, be, afraid, of, my, parents?)"
What is the best and quick way to lose weight?,"List(what, is, the, best, and, quick, way, to, lose, weight?)"


In [47]:
df1 = regex_tokenized.limit(1850)
df2 = regex_tokenized.subtract(df1)

In [48]:
display(df2.limit(1))

question,words
Why is sugar free sweeter?,"List(why, is, sugar, free, sweeter)"


In [49]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words", outputCol="result")
model = word2Vec.fit(df2.limit(1)) #possible an issue with the window size vs words count in this sentence?

# result = model.transform(regex_tokenized)
# for row in result.collect():
#     text, vector = row
#     print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

In [50]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))