In [40]:
import os
# Make the project folder the working directory: the cells below open their
# data files ("NYT-merge.csv", "model-9topic.gensim", ...) by bare file name.
# NOTE(review): absolute, machine-specific path - the notebook only runs as-is
# on this machine.
os.chdir("C:\\Users\\prowm\\OneDrive\\Desktop\\Data Science\\Public Services")

# Execute the two helper scripts inside the notebook's own namespace (-i), so
# the names they import/define are visible to every later cell. Later cells use
# pd, glob, gensim, nltk, get_articles, pre_processing, ... without importing
# them, so they presumably come from these scripts - confirm.
%run -i "libraries.py"
%run -i "functions.py"

# 1. File reading

## 1.1 Reading .csv

In [2]:
# Load the cached article/comment merge when it exists; otherwise rebuild it
# from the individual article and comment CSV files and cache the result.
# Either way `data` ends up with a clean 0..n-1 index.
try:
    data = pd.read_csv("NYT-merge.csv", sep=";", encoding="UTF-8")
    # The cache is written together with its index, which is read back as an
    # extra "Unnamed: 0" column.
    data = data.drop("Unnamed: 0", axis=1)
    print("Loaded comments: %s" %(len(data)))

except FileNotFoundError:
    print("Merge file not found, loading every .csv")

    # The working directory is the project folder (the cache above is opened
    # relative to it), so build the glob patterns from it instead of repeating
    # the absolute path.
    base_dir = os.getcwd()
    articles_path = os.path.join(base_dir, "Articles", "*.csv")
    comments_path = os.path.join(base_dir, "Comments", "*.csv")
    articles_list = glob.glob(articles_path)
    comments_list = glob.glob(comments_path)

    # Loading data
    articles = get_articles(articles_path, articles_list)
    comments = get_comments(comments_path, comments_list)

    # Merge: attach the comments to their article through the shared ID
    data = pd.merge(articles, comments, left_on="artID", right_on="comID", how="left").drop("comID", axis=1)

    data = data.dropna() # around 100k rows do not match any article ID
    data = data[data.Keywords != ''] # remove empty keywords
    data = data.drop_duplicates(subset=["Comments"], keep="first")
    # Reset the index only after all filtering, so the frame built here has the
    # same contiguous index as the one obtained by reloading the cache.
    data = data.reset_index(drop=True)
    print("Loaded comments: %s" %(len(data)))

    data.to_csv("NYT-merge.csv", sep=";", encoding="UTF-8")

Loaded comments: 2040273


## 1.2 LDA model

In [3]:
# Load the trained LDA model stored in "model-9topic.gensim" and print the
# top keywords of every topic.
try:
    ldamodel = gensim.models.LdaMulticore.load("model-9topic.gensim")
except FileNotFoundError:
    print("Error: cannot load LDA model, file not found")
    # Stop here: the following cell needs `ldamodel`, and continuing would
    # either fail later with a NameError or silently reuse a model left over
    # from a previous run of the kernel.
    raise

print_topics=10  # number of keywords shown per topic
topics = ldamodel.print_topics(num_words=print_topics)
for topic in topics:
    print(topic)

(0, '0.128*"Trump, Donald J" + 0.113*"United States Politics and Government" + 0.063*"Russia" + 0.043*"Presidential Election of 2016" + 0.037*"Federal Bureau of Investigation" + 0.029*"United States International Relations" + 0.028*"Comey, James B" + 0.026*"Cyberwarfare and Defense" + 0.023*"Russian Interference in 2016 US Elections and Ties to Trump Associates" + 0.019*"Special Prosecutors (Independent Counsel)"')
(1, '0.099*"Trump, Donald J" + 0.082*"United States Politics and Government" + 0.042*"Comey, James B" + 0.039*"Federal Bureau of Investigation" + 0.037*"Global Warming" + 0.032*"United Nations Framework Convention on Climate Change" + 0.020*"Crossword Puzzles" + 0.018*"United States International Relations" + 0.017*"Greenhouse Gas Emissions" + 0.015*"Clinton, Hillary Rodham"')
(2, '0.045*"Trump, Donald J" + 0.039*"Politics and Government" + 0.029*"United States International Relations" + 0.019*"France" + 0.018*"United States Politics and Government" + 0.017*"Macron, Emmanuel

In [4]:
# Derive `data_newkey` from the merged data and the loaded LDA model.
# lda_assign is defined outside this notebook (see functions.py).
# NOTE(review): presumably it rewrites the "Keywords" column with topic labels
# and uses "NYT-9topic.csv" as a cache/output file - confirm in functions.py.
data_newkey = lda_assign("NYT-9topic.csv", data, ldamodel)
#print(np.unique(data_newkey["Keywords"]))

# 2. Preprocessing

In [5]:
# Build two cleaned versions of the comments:
#   data_newkey - pre_processing(stopwords=0); saved to NYT-NewKeywords.csv
#   data_prep   - pre_processing(stopwords=1), short comments removed,
#                 lemmatized, de-duplicated; saved to NYT-Preprocessing.csv
#
# The two frames are separate objects. Binding both names to the same
# DataFrame would let the in-place edits made for data_prep overwrite
# data_newkey["Comments"] in memory, so the live frame would no longer match
# the file saved as NYT-NewKeywords.csv.

text = pre_processing(data_newkey["Comments"], stopwords=0)
# Drop and re-add the column so "Comments" ends up as the last column.
data_newkey = data_newkey.drop("Comments", axis=1)
data_newkey["Comments"] = text
data_newkey.to_csv("NYT-NewKeywords.csv", sep=";", encoding="UTF-8") # Saving

# The stop-word pass runs on the comments already cleaned above.
text_stop = pre_processing(data_newkey["Comments"], stopwords=1)
data_prep = data_newkey.drop("Comments", axis=1)
data_prep["Comments"] = text_stop

# Remove short comments
data_prep = remove_short(data_prep, "Comments", 9) # Remove

# Lemmatization
# NOTE(review): `lemmatizer` is not used in this cell; presumably
# lemmatization() reads it as a global - confirm in functions.py.
lemmatizer = WordNetLemmatizer()
data_prep["Comments"] = lemmatization(data_prep["Comments"])

# Remove duplicate comments
data_prep = data_prep.drop_duplicates(subset=["Comments"], keep="first")

#print(data_prep.head()); print("\n", Counter(list(data_prep["Keywords"])))


data_prep.to_csv("NYT-Preprocessing.csv", sep=";", encoding="UTF-8") # Saving

100%|██████████████████████████████████████████████████████████████████████| 2040273/2040273 [07:55<00:00, 4286.76it/s]
100%|██████████████████████████████████████████████████████████████████████| 2040273/2040273 [04:09<00:00, 8174.28it/s]
100%|█████████████████████████████████████████████████████████████████████| 2040273/2040273 [00:42<00:00, 48440.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [13:15<00:00, 271.13s/it]


In [7]:
# Re-read the lightly cleaned comments from disk and discard every row that
# contains a missing value.
# NOTE(review): the file is written together with its index, so an
# "Unnamed: 0" column is likely present here - confirm whether it should be
# dropped, as is done when NYT-merge.csv is loaded.
data_newkey = pd.read_csv(
    "NYT-NewKeywords.csv",
    sep=";",
    encoding="UTF-8",
).dropna()

## 2.1 Classification model, comment list, and tags

In [15]:
%%time
X_train, X_test, y_train, y_test = train_test_split(data_newkey["Comments"], 
                                                    data_newkey["Keywords"],
                                                    stratify=data_newkey["Keywords"],
                                                    test_size=0.3, random_state=0)

count_vect = CountVectorizer()
tfidf_vect = TfidfVectorizer(min_df=10, ngram_range=(2,2))
tfidf_transformer = TfidfTransformer()
X_train_count = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

clf = MultinomialNB(alpha=0.25, fit_prior=False).fit(X_train_tfidf, y_train)

# Constructor for sentiment
afinn = Afinn()

# Full list of comments
com_list = list(data_newkey["Comments"])
whole_list = [word for row in com_list for word in row.split(" ")]

Wall time: 3min 28s


In [11]:
# For every word of the Brown corpus, count how often it carries each
# universal POS tag.
nltk.download('universal_tagset')
word_tags = defaultdict(Counter)
for word, pos in tqdm(brown.tagged_words(tagset='universal')):
    word_tags[word].update([pos])


# Flat list of (tag, word) pairs, each sentence wrapped in START/END markers.
# NOTE(review): tags are cut to their first two characters. With the universal
# tagset this appears to merge ADJ/ADV/ADP into "AD" and PRON/PRT into "PR",
# while word_tags above keeps the full tags - confirm this is intended.
brown_tags_words = []
for sent in brown.tagged_sents(tagset='universal'):
    brown_tags_words += (
        [("START", "START")]
        + [(tag[:2], word) for (word, tag) in sent]
        + [("END", "END")]
    )

# Word distribution conditioned on the tag
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

# Next-tag distribution conditioned on the current tag (tag bigrams)
brown_tags = [pair[0] for pair in brown_tags_words]
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

# All tags that occur, including the START/END markers
distinct_tags = set(brown_tags)

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\prowm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


100%|████████████████████████████████████████████████████████████████████| 1161192/1161192 [00:05<00:00, 199355.62it/s]


# 3. Algorithm to create a new sentence

In [55]:
"""
Arguments passed to final_function:

df           = the dataframe with the lighter preprocessing
model        = the best classification model
length       = number of words used to initialize the sentence
comment_list = list of all comment words from which new words are taken
alpha        = smoothing parameter of the index that combines sentiment and frequency
end_seq      = number of elements at the end of the sentence used as input for extending the sentence
n_match      = minimum number of following sequences that must be found before end_seq is reduced
method       = if 0 the function strictly selects sentence parts with non-null sentiment
               if 1 the function may also add pertinent words with null sentiment
"""

final_function(
    df=data_newkey,
    model=clf,
    length=20,
    comment_list=whole_list,
    alpha=0.05,
    end_seq=8,
    n_match=3,
    method=1,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 200.24it/s]


Type a number as the sentiment upper bound 8
Sentiment threshold: 8
+------------+-------------------+
|       Pred | class             |
|------------+-------------------|
| 0.458383   | Ethics            |
| 0.192068   | Laws              |
| 0.0937807  | US_Elections_2016 |
| 0.0720251  | Guns              |
| 0.0594696  | News              |
| 0.0462122  | International     |
| 0.0429468  | US_Politics       |
| 0.0276211  | Environment       |
| 0.00749403 | (Social)_Media    |
+------------+-------------------+
{ ['those', 'who', 'need', 'them', 'and', 'frighten', 'even', 'more', 'people', 'into', 'the', 'shadows', 'i', 'will', 'not', 'answer', 'the', 'citizenship', 'question', 'should'] }	 predicted class: Ethics
{ those who need them and frighten even more people into the shadows i will not answer the citizenship question should }	 sentiment score: 0.0 

those who need them and frighten even more people into the shadows i will not answer the citizenship question should always b

['those',
 'who',
 'need',
 'them',
 'and',
 'frighten',
 'even',
 'more',
 'people',
 'into',
 'the',
 'shadows',
 'i',
 'will',
 'not',
 'answer',
 'the',
 'citizenship',
 'question',
 'should',
 'always',
 'be',
 'asked',
 'to',
 'help',
 'with',
 'the',
 'big',
 'grand',
 'opening',
 'party',
 'and',
 'beloved',
 'elder']