In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
import regex as re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
import nltk
from wordcloud import WordCloud, STOPWORDS
import pycountry

# Fetch the NLTK resources the rest of the notebook relies on:
# 'punkt' for word_tokenize, 'wordnet' and 'omw-1.4' for WordNetLemmatizer.
# Each call is a quick no-op when the package is already up to date.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kareem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kareem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kareem\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# Load the headlines and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV).
df = pd.read_csv('Desktop/sources.csv').drop(columns='Unnamed: 0')
df.columns

Index(['Title', 'Risk', 'URL'], dtype='object')

In [8]:
# All cleaning happens on a copy so the raw headline in 'Title' stays intact.
df['cleaned_title'] = df['Title']
# Rows that are identical in every column add nothing; keep the first occurrence.
df = df.drop_duplicates(keep='first')
df['cleaned_title'].head()

0    Inflation 'collapse' will spark big stock mark...
1    Here's what's in Biden framework to regulate c...
2    Ether falls after the smart contracts network ...
3    FedEx CEO has really lost credibility here, sa...
4    Ethereum's massive software upgrade just went ...
Name: cleaned_title, dtype: object

In [9]:
# Lookup lists taken from pycountry: two-letter codes, three-letter codes
# and country names. clean_up() strips these from the titles.
alpha2 = [entry.alpha_2 for entry in pycountry.countries]
alpha3 = [entry.alpha_3 for entry in pycountry.countries]
country_name = [entry.name for entry in pycountry.countries]


In [10]:
def clean_up(s, terms=None):
    """
    Cleans up numbers, country references and special characters from a string.

    Args:
        s: The string to be cleaned up.
        terms: Optional iterable of literal strings to strip from ``s``.
            When omitted, the notebook-level ``alpha2``, ``alpha3`` and
            ``country_name`` lists are used.

    Returns:
        A string that has been cleaned up, with leading and trailing
        whitespace removed.
    """
    if terms is None:
        terms = [*alpha2, *alpha3, *country_name]

    s = re.sub('[0-9]', "", s)

    # Longest terms first so a full name is preferred over a shorter term
    # that happens to be its prefix; the secondary key keeps the pattern
    # text deterministic.
    literals = sorted({t for t in terms if t}, key=lambda t: (-len(t), t))
    if literals:
        # The terms are literal text, not patterns: escaping stops characters
        # such as '.' from being read as regex syntax.
        alternation = "|".join(re.escape(t) for t in literals)
        # The lookarounds restrict matches to whole words, so a code like
        # "US" is removed on its own but left alone inside "USD".
        s = re.sub(r'(?<!\w)(?:' + alternation + r')(?!\w)', "", s)

    # Drop everything from the first '|' to the end of the line.
    s = re.sub('[|](.*)', "", s)
    # Remove round and square brackets but keep the text inside them.
    s = re.sub(r'[()\[\]]', "", s)
    # Remove punctuation (the comma is listed explicitly).
    s = re.sub("[-#'@.;!?$%:—&’–,]", "", s)

    return s.strip()

In [11]:
# Run clean_up on every title: digits, country codes/names and punctuation are stripped.
df['cleaned_title']=df['cleaned_title'].apply(clean_up)

In [13]:
# Spot-check: apostrophes and other punctuation should be gone,
# e.g. "Here's what's" becomes "Heres whats".
df['cleaned_title'].head()

0    Inflation collapse will spark big stock market...
1    Heres whats in Biden framework to regulate crypto
2    Ether falls after the smart contracts network ...
3    FedEx CEO has really lost credibility here say...
4    Ethereums massive software upgrade just went l...
Name: cleaned_title, dtype: object

In [14]:
def tokenize(s):
    """
    Split a string into tokens with NLTK's ``word_tokenize``.

    Args:
        s: The text to split.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    tokens = word_tokenize(s)
    return tokens

In [15]:
# Replace each cleaned string with its list of word tokens.
df['cleaned_title']=df['cleaned_title'].apply(tokenize)

In [16]:
# Spot-check: every entry should now be a list of tokens instead of one string.
df['cleaned_title'].head()

0    [Inflation, collapse, will, spark, big, stock,...
1    [Heres, whats, in, Biden, framework, to, regul...
2    [Ether, falls, after, the, smart, contracts, n...
3    [FedEx, CEO, has, really, lost, credibility, h...
4    [Ethereums, massive, software, upgrade, just, ...
Name: cleaned_title, dtype: object

In [17]:
def remove_cap(s):
    """
    Drop capitalised and one-character tokens from a list of tokens.

    A token is kept only when it is longer than one character and is
    neither title-case (``str.istitle``) nor all upper-case
    (``str.isupper``). Mixed-case tokens such as "FedEx" therefore survive,
    while sentence-initial words such as "Inflation" are removed along with
    names and acronyms.

    Args:
        s: A list of string tokens.

    Returns:
        A new list with the tokens that passed the filter, in their
        original order.
    """
    # Plain boolean operators on purpose: `|` and `&` bind tighter than `==`,
    # so mixing them with comparisons silently regroups the condition.
    return [
        word
        for word in s
        if len(word) > 1 and not word.istitle() and not word.isupper()
    ]

In [18]:
# Drop title-case, all-caps and one-character tokens from every token list.
df['cleaned_title']=df['cleaned_title'].apply(remove_cap)

In [19]:
# Spot-check: tokens such as 'Inflation', 'Biden' and 'CEO' are gone,
# while mixed-case 'FedEx' is still present at this stage.
df['cleaned_title'].head()

0    [collapse, will, spark, big, stock, market, ga...
1         [whats, in, framework, to, regulate, crypto]
2    [falls, after, the, smart, contracts, network,...
3    [FedEx, has, really, lost, credibility, here, ...
4    [massive, software, upgrade, just, went, live,...
Name: cleaned_title, dtype: object

In [20]:
def bicap_check(s):
    """
    Tell whether a string contains two or more uppercase characters.

    Args:
        s: The string to inspect.

    Returns:
        True when at least two characters of ``s`` are uppercase,
        otherwise False.
    """
    uppercase_total = sum(1 for ch in s if ch.isupper())
    return uppercase_total >= 2

In [21]:
def remove_bicap(s):
    """
    Filter out tokens that contain two or more uppercase characters.

    Args:
        s: A list of string tokens.

    Returns:
        A new list holding only the tokens for which ``bicap_check``
        is False, in their original order.
    """
    return [token for token in s if not bicap_check(token)]

In [22]:
# Drop tokens with two or more uppercase letters (e.g. 'FedEx'), which the previous filter keeps.
df['cleaned_title']=df['cleaned_title'].apply(remove_bicap)

In [23]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Each word is first passed through the Porter stemmer and the resulting
    stem is then passed through the WordNet lemmatizer.

    Args:
        l: A list of strings.

    Returns:
        A new list of strings after being stemmed and lemmatized. The input
        list is left unmodified.
    """
    # Build a fresh list instead of overwriting the caller's entries, so
    # re-running the cell on the same list does not process it twice.
    return [lemmatizer.lemmatize(ps.stem(word)) for word in l]

In [24]:
# Stem, then lemmatize, every remaining token.
df['cleaned_title']=df['cleaned_title'].apply(stem_and_lemmatize)

In [25]:
# Spot-check: tokens are now stems, e.g. 'collapse' -> 'collaps', 'regulate' -> 'regul'.
# Note that 'has' has become 'ha', which no longer matches the stop word 'has'.
df['cleaned_title'].head()

0     [collaps, will, spark, big, stock, market, gain]
1             [what, in, framework, to, regul, crypto]
2    [fall, after, the, smart, contract, network, c...
3               [ha, realli, lost, credibl, here, say]
4    [massiv, softwar, upgrad, just, went, live, he...
Name: cleaned_title, dtype: object

In [26]:
# Stop-word set shipped with the wordcloud package, shown below for reference.
# `stop` is an alias, not a copy: mutating it would also mutate STOPWORDS.
stop=STOPWORDS
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'also',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 "can't",
 'cannot',
 'com',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'else',
 'ever',
 'few',
 'for',
 'from',
 'further',
 'get',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hence',
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'however',
 'http',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'k',
 "let's",
 'like',
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'otherwise',
 'ought',
 'our',
 

In [27]:
def remove_stopwords(l, stopwords=None):
    """
    Remove English stopwords from a list of strings.

    Matching is exact and case-sensitive, so a token only counts as a stop
    word when it appears verbatim in the stop-word collection. Tokens that
    were already stemmed (for example 'has' reduced to 'ha') will not match.

    Args:
        l: A list of strings.
        stopwords: Optional collection of words to drop. When omitted, the
            notebook-level ``stop`` set is used.

    Returns:
        A new list of strings after stop words are removed. The input list
        is left unmodified.
    """
    if stopwords is None:
        stopwords = stop
    # One pass with a membership test per token, instead of repeatedly
    # scanning and shrinking the list once per stop word.
    return [word for word in l if word not in stopwords]

In [28]:
# Filter the stop words out of every token list.
df['cleaned_title']=df['cleaned_title'].apply(remove_stopwords)

In [29]:
def joining(s):
    """
    Concatenate a sequence of tokens into one space-separated string.

    Args:
        s: An iterable of strings.

    Returns:
        The tokens joined by single spaces; an empty input gives ''.
    """
    return " ".join(s)

In [30]:
# Collapse every token list back into a single space-separated string.
df['cleaned_title']=df['cleaned_title'].apply(joining)

In [31]:
# Spot-check the final text. Fragments such as 'ha' (from 'has') remain because the
# stop-word filter ran after stemming; 'doe' is presumably 'does' for the same reason.
df['cleaned_title'].head()

0    collaps will spark big stock market gain
1                      framework regul crypto
2    fall smart contract network complet merg
3                  ha realli lost credibl say
4         massiv softwar upgrad went live doe
Name: cleaned_title, dtype: object

In [252]:
# Remove, in place, the rows whose title became an empty string after cleaning.
df.drop(df.loc[df['cleaned_title']==''].index,inplace=True)

In [255]:
# Persist the cleaned frame. With the default arguments to_csv also writes the
# DataFrame index as an unnamed first column, which reads back as 'Unnamed: 0'.
df.to_csv('Desktop/cleaned_data.csv')