In [None]:
# Importing necessary libraries
from gensim.models import Word2Vec, KeyedVectors  # Gensim library for Word2Vec model
import pandas as pd  # Pandas library for data manipulation
import nltk  # NLTK library for natural language processing

In [None]:
# Load the dataset from a CSV file
df = pd.read_csv('reddit_worldnews_start_to_2016-11-22.csv')

# Display the first few rows of the dataframe to understand its structure
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [None]:
# Download the 'punkt' tokenizer models from NLTK
# Uncomment the lines below to download the models if not already downloaded
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [None]:
# Tokenize the titles in the dataframe using NLTK's word_tokenize function
newsVec = [nltk.word_tokenize(title) for title in df['title'].values]

# Display the first 5 tokenized titles to verify the result
newsVec[:5]

[['Scores', 'killed', 'in', 'Pakistan', 'clashes'],
 ['Japan', 'resumes', 'refuelling', 'mission'],
 ['US', 'presses', 'Egypt', 'on', 'Gaza', 'border'],
 ['Jump-start', 'economy', ':', 'Give', 'health', 'care', 'to', 'all'],
 ['Council', 'of', 'Europe', 'bashes', 'EU', '&', 'UN', 'terror', 'blacklist']]

In [None]:
# Train a Word2Vec model using the tokenized titles (newsVec)
# min_count=1 ensures that even words that appear only once are included in the model
model = Word2Vec(newsVec, min_count=1).wv

In [None]:
# Find the most similar words to 'man' using the trained Word2Vec model
similar_words = model.most_similar('man')

# Display the similar words
similar_words

[('woman', 0.9072763919830322),
 ('teenager', 0.8432095646858215),
 ('boy', 0.8381759524345398),
 ('girl', 0.824225127696991),
 ('couple', 0.7924955487251282),
 ('teen', 0.7605805397033691),
 ('mother', 0.7585114240646362),
 ('policeman', 0.7574959397315979),
 ('doctor', 0.7536249160766602),
 ('teacher', 0.7390760779380798)]

In [None]:
# Create a vector by adding the vectors for 'holiday', 'gifts', and 'winter'
vec = model['holiday'] + model['gifts'] + model['winter']

# Find the most similar words to the created vector
similar_words = model.most_similar([vec])

# Display the similar words
similar_words

[('holiday', 0.8276294469833374),
 ('holidays', 0.7679221630096436),
 ('gifts', 0.7360822558403015),
 ('Christmas', 0.7285211682319641),
 ('winter', 0.6602602005004883),
 ('festive', 0.6454193592071533),
 ('Thanksgiving', 0.6245847940444946),
 ('gift', 0.612133264541626),
 ('Christmastime', 0.6087110042572021),
 ('vacations', 0.5980472564697266)]

Descargamos el archivo y lo descomprimimos en la raiz del proyecto:
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

In [None]:
# Load the pre-trained Word2Vec model from the Google News dataset
# The model is in binary format, so we set binary=True
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Create a vector by adding the vectors for 'king' and 'woman' and subtracting the vector for 'man'
vec = model['king'] + model['woman'] - model['man']

# Find the most similar words to the created vector
similar_words = model.most_similar([vec])

# Display the similar words
similar_words

[('king', 0.52085942029953),
 ('woman', 0.5135486721992493),
 ('monarch', 0.48635631799697876),
 ('crown_prince', 0.47217562794685364),
 ('prince', 0.4661101698875427),
 ('princess', 0.45525479316711426),
 ('man', 0.4482707381248474),
 ('teenage_girl', 0.4421442151069641),
 ('girl', 0.42170172929763794),
 ('boy', 0.40749162435531616)]

In [None]:
# Create a vector by adding the vectors for 'holiday' and 'winter' and subtracting the vector for 'summer'
vec = model['holiday'] + model['winter'] - model['summer']

# Find the most similar words to the created vector
similar_words = model.most_similar([vec])

# Display the similar words
similar_words

[('winter', 0.3277788758277893),
 ('Thanksgiving', 0.2999882400035858),
 ('holiday', 0.2999754250049591),
 ('holidays', 0.2830084264278412),
 ('Labor_Day', 0.2706221342086792),
 ('summertime', 0.2701828181743622),
 ('springtime', 0.2672084867954254),
 ('wintry_weather', 0.26370859146118164),
 ('summer', 0.25874072313308716),
 ('spring', 0.2562393844127655)]