---
# Import Libraries
---

In [1]:
import numpy as np 
import pandas 
import csv
import warnings
import nltk
import os
import time
from pathlib import Path
nltk.download('names', download_dir='/home/shared_data/textmining_genderrecognition_topicextraction/nltk')
from nltk.corpus import names
warnings.filterwarnings("ignore")
src='/home/shared_data/textmining_genderrecognition_topicextraction/dataset_text_mining_Reuters_RCV2/'

[nltk_data] Downloading package names to /home/shared_data/textmining_
[nltk_data]     genderrecognition_topicextraction/nltk...
[nltk_data]   Package names is already up-to-date!


---
# Upload Data
---

In [2]:
path_file=src+"rcv.csv"
df=pandas.read_csv(path_file)
print(len(df))
df.head(2)

806801


Unnamed: 0,title,headline,autore,code,text
0,"UK: UK shares set for nervous week,upside seen...","UK shares set for nervous week,upside seen lim...",Dale Faulken,"UK,M11,MCAT",The UK share market is unlikely to make much h...
1,USA: CBOT wheat ends mostly lower on weather.,CBOT wheat ends mostly lower on weather.,,"USA,M14,M141,MCAT",CBOT soft red winter wheat futures closed most...


In [3]:
industry_codes=pandas.read_csv(src+'industry_codes.csv')
region_codes=pandas.read_csv(src+'region_codes.csv')
topic_codes=pandas.read_csv(src+'topic_codes.csv')
topic_codes.head(2)

Unnamed: 0,topic_code,description
0,1POL,CURRENT NEWS - POLITICS
1,2ECO,CURRENT NEWS - ECONOMICS


---
# Data analysis
---

In [4]:
df_prova=df.copy()
df_without_duplicates = df_prova.drop_duplicates()
df_without_duplicates.reset_index(drop=True,inplace=True)
print(len(df_without_duplicates))
df_without_duplicates.head(2)

803070


Unnamed: 0,title,headline,autore,code,text
0,"UK: UK shares set for nervous week,upside seen...","UK shares set for nervous week,upside seen lim...",Dale Faulken,"UK,M11,MCAT",The UK share market is unlikely to make much h...
1,USA: CBOT wheat ends mostly lower on weather.,CBOT wheat ends mostly lower on weather.,,"USA,M14,M141,MCAT",CBOT soft red winter wheat futures closed most...


In [5]:
df_without_duplicates.isna().sum()

title            1
headline         8
autore      694436
code             0
text             4
dtype: int64

## Elimination of lines with nan text
---

In [6]:
df_without_duplicates=df_without_duplicates.dropna( subset=['text']).reset_index(drop=True)

In [7]:
df_without_duplicates.isna().sum()

title            1
headline         8
autore      694432
code             0
text             0
dtype: int64

---
# Gender labels creation
---

In [8]:
print('autori non nan (autori totali - autori quelli nan):',len(df_without_duplicates)-694436)
autori = df_without_duplicates[df_without_duplicates['autore'].notna()]['autore']
print('auotri Tot(nome e cognome univoci):',len(autori.unique()))

autori non nan (autori totali - autori quelli nan): 108630
auotri Tot(nome e cognome univoci): 2400


In [9]:
src_nltk_name='/home/shared_data/textmining_genderrecognition_topicextraction/nltk/corpora/names/'
males_txt = open(src_nltk_name+'male.txt','r')
females_txt= open(src_nltk_name+'female.txt','r')
males=[]
females=[]
for line in males_txt:
    males.append(line.strip().lower())
for line in females_txt:
    females.append(line.strip().lower())
males_txt.close()
females_txt.close()

males=np.unique(np.array(males))
females=np.unique(np.array(females))
print("females->",len(females),'females->',len(males))

females-> 4997 females-> 2943


In [10]:
def find_gender(autori,neutri):
    genere= {}
    first_name_autore=[]
    for autore in autori:
        if str(autore)!='nan':
            first_name=autore.split(' ')[0].lower()
            first_name_autore.append(first_name)
            if(first_name in list(neutri)):
                genere[first_name]='Neutral'
            elif(first_name in list(males)):
                genere[first_name]='Male'
            elif(first_name in list(females)):
                genere[first_name]='Female'
            else:
                genere[first_name]='Unknown'
        else:
            genere[str(autore)]='Unknown'
    print('nomi autori(univoci):',len(genere))
    genere_autori = np.array(df_without_duplicates['autore'])
    i=0
    for aut in genere_autori:
        if str(aut)=='nan':
            genere_autori[i]='Unknown'
        else:
            first_name=aut.split(' ')[0].lower()
            genere_autori[i]=genere[first_name]
        i+=1
    return genere_autori,genere,np.unique(np.array(first_name_autore))

autori = df_without_duplicates[df_without_duplicates['autore'].notna()]['autore'].unique()

In [11]:
# nomi neutrali nel db
neutri=[]
for n in males:
    if n in females:
        neutri.append(n)

genere_autori,genere,first_name_autore=find_gender(autori,neutri)

neutri_in_ds=[]
for i in first_name_autore:
    if i in neutri:
        neutri_in_ds.append(i)
print('neutri in ds:',len(neutri_in_ds))

non_trovati=[]
for k in genere.keys():
    if(genere[k]=='Unknown'):
        non_trovati.append(k)
print('valori',len(non_trovati),'Unknown')

df_without_duplicates['Genere']=genere_autori
conta_totale_righe_maschi_femm=((df_without_duplicates['Genere']=='Female').sum())+((df_without_duplicates['Genere']=='Male').sum())
print('conta totale righe maschi femm:',conta_totale_righe_maschi_femm,"su",len(df_without_duplicates))

df_without_duplicates.head(2)

nomi autori(univoci): 1182
neutri in ds: 77
valori 495 Unknown
conta totale righe maschi femm: 79659 su 803066


Unnamed: 0,title,headline,autore,code,text,Genere
0,"UK: UK shares set for nervous week,upside seen...","UK shares set for nervous week,upside seen lim...",Dale Faulken,"UK,M11,MCAT",The UK share market is unlikely to make much h...,Neutral
1,USA: CBOT wheat ends mostly lower on weather.,CBOT wheat ends mostly lower on weather.,,"USA,M14,M141,MCAT",CBOT soft red winter wheat futures closed most...,Unknown


---
# Enter the codes of the industries and topics regions
---

In [12]:
def splitta_codes(df_without_duplicates):
    cod=df_without_duplicates['code']
    industry_codes_list=industry_codes['industry_code'].values
    region_codes_list=region_codes['region_code'].values
    topic_codes_list=topic_codes['topic_code'].values
    df_code_industry=[]
    df_code_region=[]
    df_code_topic=[]

    i=0
    check=200000
    start = time.time()
    for code_record in cod.values:
        codici=code_record.split(',')
        single_industry=[]
        single_region=[]
        single_topic=[]
        for c in codici:
            if c in industry_codes_list:
                single_industry.append(c)
            elif c in region_codes_list:
                single_region.append(c)
            elif c in topic_codes_list:
                single_topic.append(c)
        df_code_industry.append(str(",".join(single_industry)))
        df_code_region.append(str(",".join(single_region)))
        df_code_topic.append(str(",".join(single_topic))) 
        if(i%check==0 and i!=0):
            print('elaborati',i,'record!')
        i+=1

    end = time.time()
    t = time.strftime("%Hh:%Mm:%Ss",time.gmtime(int(end)-int(start)))
    print("tempo impiegato per dividere le colonna in tre(region_codes, industries_codes e topics_codes) vettori è di: "+t)
    df_without_duplicates['industry_codes']=df_code_industry
    df_without_duplicates['region_codes']=df_code_region
    df_without_duplicates['topic_codes']=df_code_topic
splitta_codes(df_without_duplicates)
df_without_duplicates.head(2)

elaborati 200000 record!
elaborati 400000 record!
elaborati 600000 record!
elaborati 800000 record!
tempo impiegato per dividere le colonna in tre(region_codes, industries_codes e topics_codes) vettori è di: 00h:04m:35s


Unnamed: 0,title,headline,autore,code,text,Genere,industry_codes,region_codes,topic_codes
0,"UK: UK shares set for nervous week,upside seen...","UK shares set for nervous week,upside seen lim...",Dale Faulken,"UK,M11,MCAT",The UK share market is unlikely to make much h...,Neutral,,UK,"M11,MCAT"
1,USA: CBOT wheat ends mostly lower on weather.,CBOT wheat ends mostly lower on weather.,,"USA,M14,M141,MCAT",CBOT soft red winter wheat futures closed most...,Unknown,,USA,"M14,M141,MCAT"


---
# Writing Prepared data
---

In [13]:
if not Path(src+"clean_dataset_with_label.csv").exists():
    df_without_duplicates.to_csv(src+"clean_dataset_with_label.csv", index=False)

---
# Multilangual dataset preparation
---

In [14]:
path_file_multilingual=src+"rcv_multilingual.csv"
df_multilingual=pandas.read_csv(path_file_multilingual)
print(len(df_multilingual))
df_multilingual.head(2)

487357


Unnamed: 0,title,headline,autore,code,text
0,,[台灣央行]標售50億台幣1個月期NCDs加權平均得標利率為6.385%,,"ASIAZ,DEVGCOZ,EASIAZ,TAIWAN,M13,MCAT",〔路透社台北24日電〕 台灣央行週四午後標售50億台幣的1個月期可轉讓定期存單(NCDs...
1,,美國經濟學家預期今年民間借款需求相當不錯,,"NAMZ,USA,USAZ,USW,E12,ECAT","〔路透社紐約29日電〕 美國著名經濟學家考夫曼表示,美國經濟成長強勁,加上短期利率僅會小..."


In [15]:
df_prova_multilingual=df_multilingual.copy()
df_without_duplicates_multilingual = df_prova_multilingual.drop_duplicates()
df_without_duplicates_multilingual.reset_index(drop=True,inplace=True)
print(len(df_without_duplicates_multilingual))
df_without_duplicates_multilingual.head(2)

439495


Unnamed: 0,title,headline,autore,code,text
0,,[台灣央行]標售50億台幣1個月期NCDs加權平均得標利率為6.385%,,"ASIAZ,DEVGCOZ,EASIAZ,TAIWAN,M13,MCAT",〔路透社台北24日電〕 台灣央行週四午後標售50億台幣的1個月期可轉讓定期存單(NCDs...
1,,美國經濟學家預期今年民間借款需求相當不錯,,"NAMZ,USA,USAZ,USW,E12,ECAT","〔路透社紐約29日電〕 美國著名經濟學家考夫曼表示,美國經濟成長強勁,加上短期利率僅會小..."


In [16]:
df_without_duplicates_multilingual.isna().sum()

title       439495
headline         1
autore      439495
code             2
text             1
dtype: int64

In [17]:
df_without_duplicates_multilingual=df_without_duplicates_multilingual.dropna( subset=['text','code']).reset_index(drop=True)
df_without_duplicates_multilingual.isna().sum()

title       439493
headline         0
autore      439493
code             0
text             0
dtype: int64

## Creation of multilingual gender labels
---

In [18]:
print('autori non nan (autori totali - autori quelli nan):',len(df_without_duplicates_multilingual)-439494)
autori_multilingual = df_without_duplicates_multilingual[df_without_duplicates_multilingual['autore'].notna()]['autore']
print('auotri Tot(nome e cognome univoci):',len(autori_multilingual.unique()))

autori non nan (autori totali - autori quelli nan): -1
auotri Tot(nome e cognome univoci): 0


In [19]:
df_without_duplicates_multilingual['autore']='Unknown'
df_without_duplicates_multilingual.head(2)

Unnamed: 0,title,headline,autore,code,text
0,,[台灣央行]標售50億台幣1個月期NCDs加權平均得標利率為6.385%,Unknown,"ASIAZ,DEVGCOZ,EASIAZ,TAIWAN,M13,MCAT",〔路透社台北24日電〕 台灣央行週四午後標售50億台幣的1個月期可轉讓定期存單(NCDs...
1,,美國經濟學家預期今年民間借款需求相當不錯,Unknown,"NAMZ,USA,USAZ,USW,E12,ECAT","〔路透社紐約29日電〕 美國著名經濟學家考夫曼表示,美國經濟成長強勁,加上短期利率僅會小..."


---
# Enter the codes of the industries and topics regions into multilingual Dataset
---

In [20]:
splitta_codes(df_without_duplicates_multilingual)
df_without_duplicates_multilingual.head(2)

elaborati 200000 record!
elaborati 400000 record!
tempo impiegato per dividere le colonna in tre(region_codes, industries_codes e topics_codes) vettori è di: 00h:04m:44s


Unnamed: 0,title,headline,autore,code,text,industry_codes,region_codes,topic_codes
0,,[台灣央行]標售50億台幣1個月期NCDs加權平均得標利率為6.385%,Unknown,"ASIAZ,DEVGCOZ,EASIAZ,TAIWAN,M13,MCAT",〔路透社台北24日電〕 台灣央行週四午後標售50億台幣的1個月期可轉讓定期存單(NCDs...,,TAIWAN,"M13,MCAT"
1,,美國經濟學家預期今年民間借款需求相當不錯,Unknown,"NAMZ,USA,USAZ,USW,E12,ECAT","〔路透社紐約29日電〕 美國著名經濟學家考夫曼表示,美國經濟成長強勁,加上短期利率僅會小...",,USA,"E12,ECAT"


---
## Writing Prepared multilingual data
--- 

In [21]:
if not Path(src+"clean_dataset_with_label_multilingual.csv").exists():
    df_without_duplicates_multilingual.to_csv(src+"clean_dataset_with_label_multilingual.csv", index=False)