# Chargement des datasets

**AG News Classification**

Représente 130 000 articles réparties en 4 catégories différentes

In [13]:
import pandas as pd

df_train = pd.read_csv(r"Dataset\AG_classif\AG_class_train.csv", header=0)
df_test = pd.read_csv(r"Dataset\AG_classif\AG_class_test.csv", header=0)

df_train = df_train.dropna()
df_test = df_test.dropna()

# Name_category : 1-World, 2-Sports, 3-Business, 4-Sci/Technology

df1 = pd.concat([df_train, df_test], axis=0, ignore_index=True)

x1 = df1.iloc[:,2]
y1 = df1.iloc[:,0]

x1 = df1.iloc[:, 1].astype(str) + " : " + x1.astype(str)
print(x1[1])
str(y1.unique())

Carlyle Looks Toward Commercial Aerospace (Reuters) : Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.


'[3 4 2 1]'

**DBpedia Ontology**

Dataset référent des pages wikipédia avec plus de 219 classe

In [14]:
import pandas as pd

df = pd.read_csv(r"Dataset\DBP\DBP_wiki_data.csv", header=0)

df2 = df.dropna()

x2 = df2["text"]
y2 = df2['l3']

list3 = str(df2['l3'].unique())

print(x2[1])
print(list3)


The 1917 Bali earthquake occurred at 06:50 local time on 21 January (23:11 on 20 January UTC). It had an estimated magnitude of 6.6 on the surface wave magnitude scale and had a maximum perceived intensity of IX (Violent) on the Mercalli intensity scale. It caused widespread damage across Bali, particularly in the south of the island. It triggered many landslides, which caused 80% of the 1500 casualties.
['Earthquake' 'SolarEclipse' 'MusicFestival' 'MilitaryConflict'
 'FilmFestival' 'Convention' 'FootballMatch' 'OlympicEvent' 'GrandPrix'
 'GolfTournament' 'WomensTennisAssociationTournament' 'TennisTournament'
 'SoccerTournament' 'WrestlingEvent' 'HorseRace' 'CyclingRace'
 'MixedMartialArtsEvent' 'Election' 'SoccerClubSeason'
 'NationalFootballLeagueSeason' 'NCAATeamSeason' 'BaseballSeason'
 'VideoGame' 'BiologicalDatabase' 'EurovisionSongContestEntry' 'Album'
 'Musical' 'ClassicalMusicComposition' 'ArtistDiscography' 'Single' 'Poem'
 'Magazine' 'Newspaper' 'AcademicJournal' 'Play' 'Man

# Partie preprocessing

## 1 - Split des datasets


In [15]:
from sklearn.model_selection import train_test_split


x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.2, random_state=42)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.2, random_state=42)




## 2 - Nettoyage des données

•	Nettoyage : Suppression url, emot, caractères spéciaux  
•	Correction de la casse : Mettre tout en minuscule  
•	Tokénisation : Découpage du texte en pièces  
•	Lemmatisation : Supprimer uniqueùent les terminaisons inflexibles et donc à isoler la forme canonique du mot (lemme)  
•	Suppression des stop words : Enlever certains mots basiques prédéfinis  
•	Stemming :supprimer suffixe et préfixes  


In [18]:
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tag import pos_tag
from nltk.corpus import wordnet, stopwords, words
import re
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('words')

dictionnary = set(words.words())
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Donne la nature du mot (nom, verbe, adjectif, adverbe) pour la lemmatisation
def get_wordnet_pos(word):
  tag = pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

def preprocessing(text):
    
    # Suppression adresse mails et mot à un seul caractère
    text = re.sub(r'\b\w\b', '', text)
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.[A-Za-z]{2,7}\b', '', text)
    
    # Séparation des phrases, suppression des espaces, mise en minuscule et suppression des chiffres
    text = text.strip()
    text= text.lower()
    text = ''.join(char for char in text if not char.isdigit())
    
    # Ponctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    tokenized = word_tokenize(text)

    words_only = [word for word in tokenized if word.isalpha()]
    good_words = [word for word in words_only if word in dictionnary]
    without_stopwords = [word for word in good_words if not word in stop_words]

    # Lemmatisation
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in without_stopwords]
    cleaned_text = " ".join(lemmatized)
    return cleaned_text

#Pour test rapide
apres = preprocessing(x_train1[1])
print(apres)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\USER/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to C:\Users\USER/nltk_data...
[nltk_data]   Package words is already up-to-date!


toward commercial private investment firm reputation make defense industry quietly another part market


In [19]:
x_train1 = x_train1.apply(preprocessing)
x_test1 = x_test1.apply(preprocessing)
x_train2 = x_train2.apply(preprocessing)
x_test2 = x_test2.apply(preprocessing)

## 3 - Création des datasets pour entrainement du modèle

### Enregistré dans le document Dataset

#### Dataset AG

In [None]:
# Vérification de la structure des datasets
print(x_train1.iloc[0])
print(y_train1.iloc[0])

print(x_train1.shape)
print(y_train1.shape)

print(x_test1.shape)
print(y_test1.shape)

x_train1.to_csv(r"Dataset\AG_classif\x_train.csv", index=False)
y_train1.to_csv(r"Dataset\AG_classif\y_train.csv", index=False)
x_test1.to_csv(r"Dataset\AG_classif\x_test.csv", index=False)
y_test1.to_csv(r"Dataset\AG_classif\y_test.csv", index=False)




mediation resolve dispute fiat general corp fiat spa meet next four try resolve dispute whether fiat force buy ail fiat auto subsidiary
3
(102080,)
(102080,)
(25520,)
(25520,)


#### Dataset DBP (wiki)

In [21]:
# Vérification de la structure des datasets
print(x_train2.iloc[0])
print(y_train2.iloc[0])

print(x_train2.shape)
print(y_train2.shape)

print(x_test2.shape)
print(y_test2.shape)

x_train2.to_csv(r"Dataset\DBP\x_train.csv", index=False)
y_train2.to_csv(r"Dataset\DBP\y_train.csv", index=False)
x_test2.to_csv(r"Dataset\DBP\x_test.csv", index=False)
y_test2.to_csv(r"Dataset\DBP\y_test.csv", index=False)

born former democratic congressman state colorado
Congressman
(274224,)
(274224,)
(68557,)
(68557,)
