In [1]:
import json
import glob
import pandas as pd
import numpy as np
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

In [2]:
filenames = glob.glob("./data/*.json")
filenames.sort()
data = []
for item in filenames:
    with open(item, 'r') as file:
        d = file.read()
    data += json.loads(d)
len(data)

7079

In [3]:
### list of important fields
# anything after 'users' is in users
fields = ['id', 'text', 'user', 'id', 'name', 'screen_name', 'verified']

In [4]:
### extract data and form a dataset.  data key is its index in the originally loaded data
keys = np.arange(len(data)).tolist()
tweet_ids, texts, name, verified = [], [], [], []
for i in range(len(data)):
    tweet_ids.append(data[i]['id'])
    texts.append(data[i]['text'])
    name.append(data[i]['user']['name'])
    verified.append(data[i]['user']['name'])

In [47]:
data[0]

{'created_at': 'Thu Sep 12 18:10:20 +0000 2019',
 'id': 1172210852415643650,
 'id_str': '1172210852415643650',
 'text': 'That’s what Americans do when others are in need – we help. We give. We inspire. Want to make a difference? There a… https://t.co/2mdo9GlaLl',
 'truncated': True,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/2mdo9GlaLl',
    'expanded_url': 'https://twitter.com/i/web/status/1172210852415643650',
    'display_url': 'twitter.com/i/web/status/1…',
    'indices': [117, 140]}]},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'in_reply_to_status_id': 1172210761583812616,
 'in_reply_to_status_id_str': '1172210761583812616',
 'in_reply_to_user_id': 813286,
 'in_reply_to_user_id_str': '813286',
 'in_reply_to_screen_name': 'BarackObama',
 'user': {'id': 813286,
  'id_str': '813286',
  'name': 'Barack Obama',
  'screen_name': 'BarackObama',
  'location': 'Washington, DC'

In [5]:
df = pd.DataFrame({'id': keys, 'tweet_id':tweet_ids,
                   'content':texts, 'author':name,
                   'verified':verified})
df.head()

Unnamed: 0,id,tweet_id,content,author,verified
0,0,1172210852415643650,That’s what Americans do when others are in ne...,Barack Obama,Barack Obama
1,1,1172210761583812616,"Jermaine Bell is just six years old, but when ...",Barack Obama,Barack Obama
2,2,1172210649654616067,Bahamian families and communities face a long ...,Barack Obama,Barack Obama
3,3,1171760194898059266,"As we remember those we lost on 9/11, we also ...",Barack Obama,Barack Obama
4,4,1171090934257397760,Marca had a remarkable way of bringing out the...,Barack Obama,Barack Obama


In [6]:
print(df.loc[178]['content'])

We are grieving with Parkland. But we are not powerless. Caring for our kids is our first job. And until we can hon… https://t.co/LWGSkzRapH


# Text cleaning and preparation

In [7]:
# \r and \n
df['content_parsed_1'] = df['content'].str.replace("\r", " ")
df['content_parsed_1'] = df['content_parsed_1'].str.replace("\n", " ")
df['content_parsed_1'] = df['content_parsed_1'].str.replace("    ", " ")

In [8]:
# " when quoting text
df['content_parsed_1'] = df['content_parsed_1'].str.replace('"', '')

In [9]:
# Lowercasing the text
df['content_parsed_2'] = df['content_parsed_1'].str.lower()

In [10]:
# punctuation sign
punctuation_signs = list("?:!.,;")
df['content_parsed_3'] = df['content_parsed_2']

for punct_sign in punctuation_signs:
    df['content_parsed_3'] = df['content_parsed_3'].str.replace(punct_sign, '')

In [11]:
# possessive nouns
df['content_parsed_4'] = df['content_parsed_3'].str.replace("'s", "")

In [12]:
# Downloading punkt and wordnet from NLTK
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

------------------------------------------------------------


[nltk_data] Downloading package punkt to /Users/daisy_lab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daisy_lab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

In [14]:
# iterate through every word to lemmatize
nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['content_parsed_4']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

In [15]:
df['content_parsed_5'] = lemmatized_text_list

In [16]:
# Downloading the stop words list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daisy_lab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Loading the stop words in english
stop_words = list(stopwords.words('english'))
stop_words[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [18]:
# Removing all stop words
df['content_parsed_6'] = df['content_parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['content_parsed_6'] = df['content_parsed_6'].str.replace(regex_stopword, '')

# Label coding

In [19]:
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

# TF-IDF vectors feature

In [20]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

# preparing data
X_test = df['content_parsed_6'].tolist()

In [21]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

features_test = tfidf.fit_transform(X_test).toarray()
print(features_test.shape)

(7079, 300)


In [22]:
feature_names = tfidf.get_feature_names()
unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

In [23]:
bigrams

['actonclimate https',
 'add name',
 'climate change',
 'donald trump',
 'doyourjob https',
 'fair hear',
 'fake news',
 'governo bolsonaro',
 'health care',
 'http tco',
 'https tco',
 'judge garland',
 'live president',
 'meu deus',
 'obama speak',
 'president obama',
 'senate leaders',
 'supreme court',
 'unite state',
 'watch president',
 'years ago']

# Prediction

In [24]:
svc_path = "./models/best_svc.pickle"
tfidf_path = "./models/tfidf.pickle"
with open(svc_path, 'rb') as file:
    svc = pickle.load(file)
with open(tfidf_path, 'rb') as file:
    ft_engine = pickle.load(file)

In [35]:
feature_test = ft_engine.transform(X_test).toarray()

In [29]:
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

In [32]:
### making prediction
for i in range(len(X_test)):
    prediction_svc = svc.predict(feature_test[i].reshape(1, -1))[0]
    prediction_svc_proba = svc.predict_proba(feature_test[i].reshape(1, -1))

    # Return result
    category_svc = get_category_name(prediction_svc)
    print(i, "The predicted category using the SVM model is %s." %(category_svc) )
    print("The conditional probability is: %a" %(prediction_svc_proba.max()*100))
#print(prediction_svc_proba)

0 The predicted category using the SVM model is sport.
The conditional probability is: 43.30107224431667
1 The predicted category using the SVM model is sport.
The conditional probability is: 49.02145378438439
2 The predicted category using the SVM model is sport.
The conditional probability is: 80.68670049152159
3 The predicted category using the SVM model is sport.
The conditional probability is: 51.69305074157179
4 The predicted category using the SVM model is business.
The conditional probability is: 67.77086302801982
5 The predicted category using the SVM model is sport.
The conditional probability is: 77.7554766493244
6 The predicted category using the SVM model is sport.
The conditional probability is: 51.08935805851218
7 The predicted category using the SVM model is sport.
The conditional probability is: 37.32561397225625
8 The predicted category using the SVM model is sport.
The conditional probability is: 45.7544963821579
9 The predicted category using the SVM model is sport.

165 The predicted category using the SVM model is sport.
The conditional probability is: 45.98276782557641
166 The predicted category using the SVM model is sport.
The conditional probability is: 98.73290962053987
167 The predicted category using the SVM model is sport.
The conditional probability is: 37.883094867394156
168 The predicted category using the SVM model is sport.
The conditional probability is: 50.67637782875328
169 The predicted category using the SVM model is tech.
The conditional probability is: 71.04558169621365
170 The predicted category using the SVM model is sport.
The conditional probability is: 91.57502606492912
171 The predicted category using the SVM model is sport.
The conditional probability is: 90.07207211027739
172 The predicted category using the SVM model is sport.
The conditional probability is: 63.87759430882801
173 The predicted category using the SVM model is entertainment.
The conditional probability is: 99.97629199649182
174 The predicted category us

344 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
345 The predicted category using the SVM model is business.
The conditional probability is: 43.8442487316205
346 The predicted category using the SVM model is sport.
The conditional probability is: 67.19983342852119
347 The predicted category using the SVM model is sport.
The conditional probability is: 98.80668672472048
348 The predicted category using the SVM model is business.
The conditional probability is: 33.450095794140154
349 The predicted category using the SVM model is sport.
The conditional probability is: 46.96804612830021
350 The predicted category using the SVM model is business.
The conditional probability is: 38.06834628865056
351 The predicted category using the SVM model is sport.
The conditional probability is: 67.19983342852119
352 The predicted category using the SVM model is politics.
The conditional probability is: 48.63755568497972
353 The predicted categor

518 The predicted category using the SVM model is sport.
The conditional probability is: 46.968526109288625
519 The predicted category using the SVM model is sport.
The conditional probability is: 84.2120087101796
520 The predicted category using the SVM model is sport.
The conditional probability is: 59.17911099073906
521 The predicted category using the SVM model is sport.
The conditional probability is: 61.751843016357476
522 The predicted category using the SVM model is business.
The conditional probability is: 47.541823488737954
523 The predicted category using the SVM model is sport.
The conditional probability is: 46.41394617081587
524 The predicted category using the SVM model is sport.
The conditional probability is: 68.6099023065008
525 The predicted category using the SVM model is sport.
The conditional probability is: 63.15130729951607
526 The predicted category using the SVM model is tech.
The conditional probability is: 37.860321835977054
527 The predicted category using 

689 The predicted category using the SVM model is entertainment.
The conditional probability is: 71.8385082468235
690 The predicted category using the SVM model is business.
The conditional probability is: 32.94995687535618
691 The predicted category using the SVM model is sport.
The conditional probability is: 61.8213283934841
692 The predicted category using the SVM model is business.
The conditional probability is: 37.40949548137873
693 The predicted category using the SVM model is sport.
The conditional probability is: 65.16592578186422
694 The predicted category using the SVM model is entertainment.
The conditional probability is: 70.33661988606814
695 The predicted category using the SVM model is business.
The conditional probability is: 38.931271321373984
696 The predicted category using the SVM model is tech.
The conditional probability is: 35.8962995532371
697 The predicted category using the SVM model is entertainment.
The conditional probability is: 77.59121275289367
698 The

868 The predicted category using the SVM model is sport.
The conditional probability is: 47.437759359127725
869 The predicted category using the SVM model is sport.
The conditional probability is: 39.299992490490645
870 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
871 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
872 The predicted category using the SVM model is sport.
The conditional probability is: 47.62938889835634
873 The predicted category using the SVM model is entertainment.
The conditional probability is: 95.52500319654432
874 The predicted category using the SVM model is politics.
The conditional probability is: 46.15353139021374
875 The predicted category using the SVM model is business.
The conditional probability is: 56.87971119449029
876 The predicted category using the SVM model is business.
The conditional probability is: 56.87971119449029
877 The predict

1046 The predicted category using the SVM model is business.
The conditional probability is: 35.302232511174346
1047 The predicted category using the SVM model is business.
The conditional probability is: 42.717073453704465
1048 The predicted category using the SVM model is entertainment.
The conditional probability is: 70.58498382757752
1049 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
1050 The predicted category using the SVM model is business.
The conditional probability is: 40.65190786267945
1051 The predicted category using the SVM model is entertainment.
The conditional probability is: 81.81381593814623
1052 The predicted category using the SVM model is sport.
The conditional probability is: 35.07430942819724
1053 The predicted category using the SVM model is sport.
The conditional probability is: 80.62320753908043
1054 The predicted category using the SVM model is business.
The conditional probability is: 32.5712683557

1225 The predicted category using the SVM model is entertainment.
The conditional probability is: 70.01001819743993
1226 The predicted category using the SVM model is sport.
The conditional probability is: 78.34115280896621
1227 The predicted category using the SVM model is business.
The conditional probability is: 88.23552147724754
1228 The predicted category using the SVM model is business.
The conditional probability is: 55.42726727303052
1229 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
1230 The predicted category using the SVM model is entertainment.
The conditional probability is: 81.59736049617312
1231 The predicted category using the SVM model is sport.
The conditional probability is: 49.92424220626344
1232 The predicted category using the SVM model is business.
The conditional probability is: 29.210734677156303
1233 The predicted category using the SVM model is sport.
The conditional probability is: 44.6571170321503
123

1406 The predicted category using the SVM model is sport.
The conditional probability is: 66.04172783109031
1407 The predicted category using the SVM model is entertainment.
The conditional probability is: 60.64605355777072
1408 The predicted category using the SVM model is business.
The conditional probability is: 81.26061753633522
1409 The predicted category using the SVM model is entertainment.
The conditional probability is: 89.5629813751736
1410 The predicted category using the SVM model is entertainment.
The conditional probability is: 69.09450923215442
1411 The predicted category using the SVM model is business.
The conditional probability is: 81.26061753633522
1412 The predicted category using the SVM model is business.
The conditional probability is: 65.6343975234178
1413 The predicted category using the SVM model is sport.
The conditional probability is: 73.68826309809691
1414 The predicted category using the SVM model is sport.
The conditional probability is: 84.951126862508

1585 The predicted category using the SVM model is business.
The conditional probability is: 50.46124970713741
1586 The predicted category using the SVM model is business.
The conditional probability is: 37.67133543690944
1587 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
1588 The predicted category using the SVM model is business.
The conditional probability is: 60.064086672274286
1589 The predicted category using the SVM model is business.
The conditional probability is: 38.95244473446941
1590 The predicted category using the SVM model is business.
The conditional probability is: 46.67836202898029
1591 The predicted category using the SVM model is sport.
The conditional probability is: 58.776302747459255
1592 The predicted category using the SVM model is business.
The conditional probability is: 38.34216351272758
1593 The predicted category using the SVM model is business.
The conditional probability is: 54.45419240503043
15

1766 The predicted category using the SVM model is business.
The conditional probability is: 60.07458423774084
1767 The predicted category using the SVM model is business.
The conditional probability is: 42.888608384282044
1768 The predicted category using the SVM model is business.
The conditional probability is: 59.76178225988581
1769 The predicted category using the SVM model is business.
The conditional probability is: 40.58108053383685
1770 The predicted category using the SVM model is sport.
The conditional probability is: 68.6099023065008
1771 The predicted category using the SVM model is entertainment.
The conditional probability is: 73.9096705797577
1772 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
1773 The predicted category using the SVM model is sport.
The conditional probability is: 37.25283496798634
1774 The predicted category using the SVM model is business.
The conditional probability is: 42.447146268252226
1775 

1941 The predicted category using the SVM model is business.
The conditional probability is: 71.63094647877209
1942 The predicted category using the SVM model is business.
The conditional probability is: 39.076430339004865
1943 The predicted category using the SVM model is business.
The conditional probability is: 94.50152950252284
1944 The predicted category using the SVM model is business.
The conditional probability is: 46.02324199776492
1945 The predicted category using the SVM model is business.
The conditional probability is: 36.96789172482682
1946 The predicted category using the SVM model is sport.
The conditional probability is: 57.42454766711358
1947 The predicted category using the SVM model is sport.
The conditional probability is: 58.7136938317348
1948 The predicted category using the SVM model is business.
The conditional probability is: 52.96601923709026
1949 The predicted category using the SVM model is business.
The conditional probability is: 44.85762085633752
1950 Th

2109 The predicted category using the SVM model is sport.
The conditional probability is: 44.28394062283844
2110 The predicted category using the SVM model is sport.
The conditional probability is: 62.90425775612605
2111 The predicted category using the SVM model is sport.
The conditional probability is: 46.82660586299715
2112 The predicted category using the SVM model is tech.
The conditional probability is: 46.00332278131746
2113 The predicted category using the SVM model is politics.
The conditional probability is: 57.56435060201541
2114 The predicted category using the SVM model is business.
The conditional probability is: 55.303402275489525
2115 The predicted category using the SVM model is sport.
The conditional probability is: 35.27146472662792
2116 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
2117 The predicted category using the SVM model is politics.
The conditional probability is: 65.95485953134482
2118 The predicted 

2276 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
2277 The predicted category using the SVM model is politics.
The conditional probability is: 86.6510804912489
2278 The predicted category using the SVM model is business.
The conditional probability is: 60.45815374321338
2279 The predicted category using the SVM model is sport.
The conditional probability is: 35.04075427997802
2280 The predicted category using the SVM model is business.
The conditional probability is: 42.49483945928335
2281 The predicted category using the SVM model is sport.
The conditional probability is: 74.25784620705721
2282 The predicted category using the SVM model is business.
The conditional probability is: 92.92248593176706
2283 The predicted category using the SVM model is business.
The conditional probability is: 93.10234778134293
2284 The predicted category using the SVM model is sport.
The conditional probability is: 43.86745853765207
2285 The pr

2443 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
2444 The predicted category using the SVM model is entertainment.
The conditional probability is: 67.03735675966512
2445 The predicted category using the SVM model is sport.
The conditional probability is: 72.22247997198046
2446 The predicted category using the SVM model is business.
The conditional probability is: 82.24589134093027
2447 The predicted category using the SVM model is sport.
The conditional probability is: 39.299992490490645
2448 The predicted category using the SVM model is business.
The conditional probability is: 43.96597432193426
2449 The predicted category using the SVM model is business.
The conditional probability is: 46.1311906329603
2450 The predicted category using the SVM model is politics.
The conditional probability is: 37.05471624156157
2451 The predicted category using the SVM model is entertainment.
The conditional probability is: 84.52274876385746


2611 The predicted category using the SVM model is politics.
The conditional probability is: 37.52032263093682
2612 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
2613 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
2614 The predicted category using the SVM model is business.
The conditional probability is: 28.581256016578983
2615 The predicted category using the SVM model is sport.
The conditional probability is: 62.506326372427864
2616 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
2617 The predicted category using the SVM model is business.
The conditional probability is: 28.92248584452462
2618 The predicted category using the SVM model is business.
The conditional probability is: 60.45815374321338
2619 The predicted category using the SVM model is sport.
The conditional probability is: 65.14370059388779
2620 The predi

2769 The predicted category using the SVM model is sport.
The conditional probability is: 65.61380322377967
2770 The predicted category using the SVM model is politics.
The conditional probability is: 59.59193353780172
2771 The predicted category using the SVM model is sport.
The conditional probability is: 55.50747709049551
2772 The predicted category using the SVM model is business.
The conditional probability is: 58.142282243027
2773 The predicted category using the SVM model is business.
The conditional probability is: 32.42016496790323
2774 The predicted category using the SVM model is sport.
The conditional probability is: 43.79008496650656
2775 The predicted category using the SVM model is politics.
The conditional probability is: 41.95021791232224
2776 The predicted category using the SVM model is business.
The conditional probability is: 35.4042346032343
2777 The predicted category using the SVM model is politics.
The conditional probability is: 64.39850753166978
2778 The pred

2922 The predicted category using the SVM model is entertainment.
The conditional probability is: 79.16704082966145
2923 The predicted category using the SVM model is business.
The conditional probability is: 46.008155246244556
2924 The predicted category using the SVM model is business.
The conditional probability is: 69.52047709419519
2925 The predicted category using the SVM model is business.
The conditional probability is: 88.63416705117416
2926 The predicted category using the SVM model is sport.
The conditional probability is: 35.29829913078268
2927 The predicted category using the SVM model is sport.
The conditional probability is: 46.527942717230914
2928 The predicted category using the SVM model is business.
The conditional probability is: 34.144981418612794
2929 The predicted category using the SVM model is politics.
The conditional probability is: 49.75839785605789
2930 The predicted category using the SVM model is business.
The conditional probability is: 31.18547795705941

3082 The predicted category using the SVM model is tech.
The conditional probability is: 87.49730869550599
3083 The predicted category using the SVM model is business.
The conditional probability is: 71.20150350970421
3084 The predicted category using the SVM model is sport.
The conditional probability is: 93.16793584397736
3085 The predicted category using the SVM model is business.
The conditional probability is: 47.35786623398118
3086 The predicted category using the SVM model is business.
The conditional probability is: 63.96818954002964
3087 The predicted category using the SVM model is entertainment.
The conditional probability is: 43.729307660897746
3088 The predicted category using the SVM model is sport.
The conditional probability is: 29.565874911609797
3089 The predicted category using the SVM model is tech.
The conditional probability is: 94.50839135367531
3090 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
3091 The

3256 The predicted category using the SVM model is business.
The conditional probability is: 42.72271161970441
3257 The predicted category using the SVM model is sport.
The conditional probability is: 48.19631041008645
3258 The predicted category using the SVM model is sport.
The conditional probability is: 42.29857571926045
3259 The predicted category using the SVM model is politics.
The conditional probability is: 62.04997483635325
3260 The predicted category using the SVM model is business.
The conditional probability is: 50.31558154097681
3261 The predicted category using the SVM model is sport.
The conditional probability is: 81.37538739974292
3262 The predicted category using the SVM model is business.
The conditional probability is: 29.95814543068962
3263 The predicted category using the SVM model is sport.
The conditional probability is: 71.07568359834536
3264 The predicted category using the SVM model is business.
The conditional probability is: 57.71258343640522
3265 The pred

3426 The predicted category using the SVM model is sport.
The conditional probability is: 82.34262004016614
3427 The predicted category using the SVM model is sport.
The conditional probability is: 92.75502366802412
3428 The predicted category using the SVM model is business.
The conditional probability is: 37.217149421289015
3429 The predicted category using the SVM model is sport.
The conditional probability is: 33.58152672822436
3430 The predicted category using the SVM model is business.
The conditional probability is: 51.192340155089965
3431 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3432 The predicted category using the SVM model is sport.
The conditional probability is: 43.98968374123049
3433 The predicted category using the SVM model is sport.
The conditional probability is: 48.33799000382557
3434 The predicted category using the SVM model is sport.
The conditional probability is: 63.782825888071926
3435 The predicted 

3598 The predicted category using the SVM model is sport.
The conditional probability is: 33.93703698366149
3599 The predicted category using the SVM model is sport.
The conditional probability is: 65.73718036587533
3600 The predicted category using the SVM model is sport.
The conditional probability is: 54.59104399784177
3601 The predicted category using the SVM model is entertainment.
The conditional probability is: 85.19324483712826
3602 The predicted category using the SVM model is business.
The conditional probability is: 59.22315348821348
3603 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3604 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3605 The predicted category using the SVM model is business.
The conditional probability is: 63.24517307016787
3606 The predicted category using the SVM model is sport.
The conditional probability is: 44.43221656670889
3607 The predi

3753 The predicted category using the SVM model is business.
The conditional probability is: 94.1101715547408
3754 The predicted category using the SVM model is business.
The conditional probability is: 77.31319535689735
3755 The predicted category using the SVM model is entertainment.
The conditional probability is: 90.7701643279013
3756 The predicted category using the SVM model is sport.
The conditional probability is: 59.813002549215064
3757 The predicted category using the SVM model is politics.
The conditional probability is: 40.08047608449358
3758 The predicted category using the SVM model is sport.
The conditional probability is: 61.216878850273694
3759 The predicted category using the SVM model is sport.
The conditional probability is: 49.912822166245554
3760 The predicted category using the SVM model is business.
The conditional probability is: 40.21454140350757
3761 The predicted category using the SVM model is tech.
The conditional probability is: 59.98668178569508
3762 The

3927 The predicted category using the SVM model is entertainment.
The conditional probability is: 71.0026126303122
3928 The predicted category using the SVM model is sport.
The conditional probability is: 72.61175977367351
3929 The predicted category using the SVM model is sport.
The conditional probability is: 82.33254434296478
3930 The predicted category using the SVM model is sport.
The conditional probability is: 69.55599315731943
3931 The predicted category using the SVM model is sport.
The conditional probability is: 47.883079223200006
3932 The predicted category using the SVM model is sport.
The conditional probability is: 47.942407258997285
3933 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3934 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3935 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
3936 The predicted 

4104 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4105 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4106 The predicted category using the SVM model is business.
The conditional probability is: 41.98766406527424
4107 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4108 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4109 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4110 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4111 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4112 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4113 The predicted catego

4275 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4276 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4277 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4278 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4279 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4280 The predicted category using the SVM model is sport.
The conditional probability is: 75.72581142907022
4281 The predicted category using the SVM model is business.
The conditional probability is: 72.46969749829556
4282 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4283 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4284 The predicted catego

4430 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4431 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4432 The predicted category using the SVM model is sport.
The conditional probability is: 75.72581142907022
4433 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4434 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4435 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4436 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4437 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4438 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4439 The predicted category 

4584 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4585 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4586 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4587 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4588 The predicted category using the SVM model is business.
The conditional probability is: 42.92925578127482
4589 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4590 The predicted category using the SVM model is business.
The conditional probability is: 42.92925578127482
4591 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4592 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4593 The predicted cat

4751 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4752 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4753 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4754 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4755 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4756 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4757 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4758 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4759 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4760 The predicted category 

4926 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4927 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4928 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4929 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4930 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4931 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4932 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4933 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4934 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
4935 The predicted category 

5095 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5096 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5097 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5098 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5099 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5100 The predicted category using the SVM model is tech.
The conditional probability is: 78.93598010945523
5101 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5102 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5103 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5104 The predicted category u

5255 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5256 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5257 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5258 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5259 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5260 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5261 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5262 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5263 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5264 The predicted category 

5418 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5419 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5420 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5421 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5422 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5423 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5424 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5425 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5426 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5427 The predicted category 

5596 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5597 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5598 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5599 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5600 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5601 The predicted category using the SVM model is business.
The conditional probability is: 65.45275159614108
5602 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5603 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5604 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5605 The predicted catego

5751 The predicted category using the SVM model is sport.
The conditional probability is: 77.99781654835007
5752 The predicted category using the SVM model is business.
The conditional probability is: 93.29622549279777
5753 The predicted category using the SVM model is business.
The conditional probability is: 80.23181534998318
5754 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5755 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5756 The predicted category using the SVM model is sport.
The conditional probability is: 67.29993157914372
5757 The predicted category using the SVM model is business.
The conditional probability is: 47.86542256011664
5758 The predicted category using the SVM model is entertainment.
The conditional probability is: 63.373987352776496
5759 The predicted category using the SVM model is business.
The conditional probability is: 54.832939537549194
5760 T

5905 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
5906 The predicted category using the SVM model is politics.
The conditional probability is: 42.45865014086231
5907 The predicted category using the SVM model is sport.
The conditional probability is: 46.975306912823214
5908 The predicted category using the SVM model is business.
The conditional probability is: 59.550882029025175
5909 The predicted category using the SVM model is business.
The conditional probability is: 75.99266150987542
5910 The predicted category using the SVM model is tech.
The conditional probability is: 49.56884149193389
5911 The predicted category using the SVM model is sport.
The conditional probability is: 62.18094077361349
5912 The predicted category using the SVM model is business.
The conditional probability is: 72.96410572334315
5913 The predicted category using the SVM model is business.
The conditional probability is: 66.80621093960318
5914 The pre

6064 The predicted category using the SVM model is sport.
The conditional probability is: 65.52884419499327
6065 The predicted category using the SVM model is politics.
The conditional probability is: 45.627560505101144
6066 The predicted category using the SVM model is sport.
The conditional probability is: 28.265666566185963
6067 The predicted category using the SVM model is business.
The conditional probability is: 50.044514034716215
6068 The predicted category using the SVM model is business.
The conditional probability is: 38.035574241349934
6069 The predicted category using the SVM model is sport.
The conditional probability is: 36.01397352716793
6070 The predicted category using the SVM model is sport.
The conditional probability is: 51.925757538794024
6071 The predicted category using the SVM model is politics.
The conditional probability is: 58.839942439353045
6072 The predicted category using the SVM model is business.
The conditional probability is: 64.76661488196712
6073 Th

6220 The predicted category using the SVM model is business.
The conditional probability is: 46.63522443170285
6221 The predicted category using the SVM model is sport.
The conditional probability is: 61.26415624120697
6222 The predicted category using the SVM model is business.
The conditional probability is: 98.37423193404655
6223 The predicted category using the SVM model is business.
The conditional probability is: 63.96791065581359
6224 The predicted category using the SVM model is sport.
The conditional probability is: 92.75502366802412
6225 The predicted category using the SVM model is entertainment.
The conditional probability is: 80.58697987776627
6226 The predicted category using the SVM model is sport.
The conditional probability is: 75.96130039452596
6227 The predicted category using the SVM model is sport.
The conditional probability is: 35.36944447182369
6228 The predicted category using the SVM model is business.
The conditional probability is: 38.94408220132895
6229 The

6382 The predicted category using the SVM model is politics.
The conditional probability is: 66.27823247015363
6383 The predicted category using the SVM model is sport.
The conditional probability is: 78.50169090762556
6384 The predicted category using the SVM model is sport.
The conditional probability is: 30.788404345889276
6385 The predicted category using the SVM model is sport.
The conditional probability is: 49.917786235827336
6386 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6387 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6388 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6389 The predicted category using the SVM model is politics.
The conditional probability is: 59.899224925496455
6390 The predicted category using the SVM model is sport.
The conditional probability is: 81.57804725374767
6391 The predicted 

6546 The predicted category using the SVM model is sport.
The conditional probability is: 30.319247020875473
6547 The predicted category using the SVM model is sport.
The conditional probability is: 79.66304624628673
6548 The predicted category using the SVM model is tech.
The conditional probability is: 58.844515509202665
6549 The predicted category using the SVM model is sport.
The conditional probability is: 51.03607237111759
6550 The predicted category using the SVM model is business.
The conditional probability is: 37.224435793596385
6551 The predicted category using the SVM model is business.
The conditional probability is: 64.23835084325408
6552 The predicted category using the SVM model is business.
The conditional probability is: 70.32358239554766
6553 The predicted category using the SVM model is business.
The conditional probability is: 61.21984631930477
6554 The predicted category using the SVM model is business.
The conditional probability is: 83.75826630717408
6555 The pr

6717 The predicted category using the SVM model is sport.
The conditional probability is: 70.47589340136
6718 The predicted category using the SVM model is sport.
The conditional probability is: 78.78455538547297
6719 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6720 The predicted category using the SVM model is sport.
The conditional probability is: 38.90085379856279
6721 The predicted category using the SVM model is sport.
The conditional probability is: 63.00705741719834
6722 The predicted category using the SVM model is sport.
The conditional probability is: 74.58910310680054
6723 The predicted category using the SVM model is sport.
The conditional probability is: 53.152920104888146
6724 The predicted category using the SVM model is business.
The conditional probability is: 45.78839714984826
6725 The predicted category using the SVM model is sport.
The conditional probability is: 57.066179450138655
6726 The predicted categor

6876 The predicted category using the SVM model is business.
The conditional probability is: 53.277088075167434
6877 The predicted category using the SVM model is business.
The conditional probability is: 56.702930488392965
6878 The predicted category using the SVM model is entertainment.
The conditional probability is: 75.27475628731302
6879 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6880 The predicted category using the SVM model is politics.
The conditional probability is: 39.63012182778129
6881 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
6882 The predicted category using the SVM model is entertainment.
The conditional probability is: 58.27914192515181
6883 The predicted category using the SVM model is sport.
The conditional probability is: 92.75502366802412
6884 The predicted category using the SVM model is sport.
The conditional probability is: 99.99148720025117
6

7037 The predicted category using the SVM model is sport.
The conditional probability is: 39.28305087722438
7038 The predicted category using the SVM model is sport.
The conditional probability is: 42.62757318321137
7039 The predicted category using the SVM model is sport.
The conditional probability is: 92.75502366802412
7040 The predicted category using the SVM model is sport.
The conditional probability is: 37.86659674204891
7041 The predicted category using the SVM model is sport.
The conditional probability is: 62.17961001789681
7042 The predicted category using the SVM model is business.
The conditional probability is: 54.62242481187095
7043 The predicted category using the SVM model is tech.
The conditional probability is: 45.40265170788964
7044 The predicted category using the SVM model is business.
The conditional probability is: 38.814861805395154
7045 The predicted category using the SVM model is sport.
The conditional probability is: 48.055129776977964
7046 The predicted ca

In [46]:
X_test[132]

'elections matter    power  protect  family’ health care make  plan grab  friends  vote https//tco/ry8ip1km9p'

In [48]:
texts[132]

'Elections matter. You have the power to protect a family’s health care. Make a plan, grab some friends, and vote: https://t.co/rY8Ip1km9p'