In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
articles_dataset_root_path = 'data/'
articles = pd.read_csv(articles_dataset_root_path + 'articles.csv', sep = ';', index_col = [0])

In [3]:
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from nltk.corpus import stopwords
from tqdm import tqdm

def preprocess_text(text, stopwords, punctuation, stemmer = SnowballStemmer('italian')):
    raw_string = str(text).lower() # Convert to lowercase for better matching later
    raw_string = raw_string.replace('–', '') # replace this damn special character
    [raw_string.replace(symbol, '') for symbol in punctuation] # Removing punctuation from string
    
    word_list = [stemmer.stem(word) for word in raw_string.split(' ') if word not in stopwords and len(word)>0] # Removing stopwords and making raw string into list
    return ' '.join(word_list)

In [4]:
ita_stopwords = set(stopwords.words('italian'))
ita_punctuation = set(punctuation) 
stemmer = SnowballStemmer('italian')

preprocessed_df = pd.DataFrame(data = [] , columns = articles.columns)

for i in tqdm(articles.index):
    row_data = articles.iloc[i].copy()

    row_data['title'] = preprocess_text(row_data['title'], ita_stopwords, ita_punctuation, stemmer)
    row_data['content'] = preprocess_text(row_data['content'], ita_stopwords, ita_punctuation, stemmer)
    
    preprocessed_df = preprocessed_df.append(row_data)
    
preprocessed_df

100%|██████████| 24584/24584 [03:42<00:00, 110.68it/s]


Unnamed: 0,title,content,date,author,region,zone
0,d’albert riunion anci regional decar,teram president gianguid d’albert partecip riu...,2020-04-15,Abruzzonews,Abruzzo,Centre
1,"coronavirus, novit introdott dpcm 10 april 2020",sit minister dell’intern stat pubblic dirett i...,2020-04-15,Abruzzonews,Abruzzo,Centre
2,"coronavirus, nuov misur vend gener alimentar a...","pescar oggi, mercoled 15 aprile, president reg...",2020-04-15,Abruzzonews,Abruzzo,Centre
3,"regione, ecco guid formazion distanz iefp",pescar ver propr guid oper grad rispond dubb i...,2020-04-15,Abruzzonews,Abruzzo,Centre
4,ministr amendol oddat all’iniz pd abruzz facebook,region “le scelt pd. l’ital l’abruzzo”: intito...,2020-04-15,Abruzzonews,Abruzzo,Centre
...,...,...,...,...,...,...
24579,"cop pir giornal telegram, salg 114 canal oscur",sal 114 numer canal telegram sequestr procur b...,2020-05-04,La Repubblica,Lazio,Centre
24580,"inchiest sanit umbra, chius indagini. l'ex gov...","chiusur dell'indagin assunzion sanità, destabi...",2020-05-04,La Repubblica,Lazio,Centre
24581,"coronavirus italia, 3691 denunc prim giorn fas 2","rom - aspett boom sanzion prim giorn fas due, ...",2020-05-05,La Repubblica,Lazio,Centre
24582,"coronavirus, medicina, denunc azione: ""tropp p...","ventiduemil medic pront entrar specialità, and...",2020-05-06,La Repubblica,Lazio,Centre


In [5]:
preprocessed_df.to_csv('data/preprocessed_articles.csv', sep = ';', na_rep = 'NULL')