In [1]:
import pandas
import ast
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from collections import Counter
import json
import numpy as np

In [2]:
def load_class():
    """Load the list of class labels from ``topics.txt``.

    The file is read from the working directory and is expected to hold one
    label per line. Duplicate labels are collapsed and blank lines are
    ignored.

    Returns:
        list[str]: the distinct labels in sorted order, so that the index of
        a label stays the same from one interpreter run to the next (the
        iteration order of a set of strings does not).
    """
    labels = set()
    with open("topics.txt", 'r', encoding="utf-8") as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate a final line that has no trailing newline.
            label = line.rstrip("\r\n")
            if label:
                labels.add(label)
    return sorted(labels)

In [3]:
# Shared NLP helpers: a word-character tokenizer and a French Snowball stemmer.
tokenizer = nltk.RegexpTokenizer(r'\w+')
stemmer = FrenchStemmer()
# Label list read from topics.txt; get_classes() and real_get_classes() turn
# labels into positions in this list.
CLASSES = load_class()

In [4]:
def tokenize_and_stem(text):
    """Turn raw text into a list of French stems.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic, or that appear in NLTK's French stop-word list,
    are dropped; the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text (str): the raw document.

    Returns:
        list[str]: the stems, in document order.
    """
    # Build the stop-word set once per call. Calling stopwords.words() inside
    # the filter rebuilt the list for every token and made each membership
    # test a linear scan.
    french_stopwords = set(stopwords.words('french'))
    return [
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in french_stopwords
    ]


In [5]:
def real_get_classes(text):
    """Map a literal list of labels to their positions in ``CLASSES``.

    ``text`` is parsed with ``ast.literal_eval``; every resulting label is
    converted to ``str`` and looked up in the module-level ``CLASSES`` list.
    A label that is not in ``CLASSES`` raises ``ValueError``.

    Returns:
        list[int]: one index per label, in the order the labels appear.
    """
    return [CLASSES.index(str(name)) for name in ast.literal_eval(text)]

In [6]:
def get_classes(text):
    """Return the position of ``text``, converted to ``str``, in ``CLASSES``.

    Raises ``ValueError`` when the label is not in the list.
    """
    label = str(text)
    position = CLASSES.index(label)
    return position

In [7]:
def preprocess(filename):
    """Stem every document of a ``topic#text`` file and build the vocabulary.

    Reads ``filename`` (``#``-delimited, with ``topic`` and ``text`` columns)
    and writes two files in the working directory:

    * ``intermediate.csv`` -- under a ``topic#text`` header, one
      ``<class index>#<list of stems>`` row per input row;
    * ``vocabulary.txt`` -- a single line holding the ``str`` of a dict that
      maps each stem to its frequency rank (the most frequent stem gets 0).

    Args:
        filename (str): path of the input file.
    """
    counter = Counter()
    df = pandas.read_csv(filename, delimiter="#", encoding="utf-8")
    with open("intermediate.csv", "w", encoding="utf-8") as file:
        file.write("topic#text\n")
        # Iterate over the column values directly so the loop does not rely
        # on the frame carrying a 0..n-1 index.
        for raw_topic, raw_text in zip(df["topic"], df["text"]):
            topic = get_classes(raw_topic)
            stems = tokenize_and_stem(raw_text)
            # update() only touches the stems of this document, whereas
            # ``counter += Counter(stems)`` also rescans every key already
            # accumulated, on every row.
            counter.update(stems)
            file.write(str(topic) + "#" + str(stems) + "\n")

    # most_common() with no argument returns every stem, most frequent first.
    vocabulary = {stem: rank for rank, (stem, _) in enumerate(counter.most_common())}

    with open("vocabulary.txt", "w", encoding="utf-8") as file:
        file.write(str(vocabulary) + "\n")

In [8]:
def convert(vocabulary, text):
    """Encode ``text`` as the vocabulary indices of its stems.

    The text is passed through ``tokenize_and_stem``; stems that are not keys
    of ``vocabulary`` are silently left out.

    Returns:
        list: ``vocabulary[stem]`` for each known stem, in document order.
    """
    indexes = []
    for stem in tokenize_and_stem(text):
        if stem in vocabulary:
            indexes.append(vocabulary[stem])
    return indexes

In [9]:
def load_voc():
    """Read the vocabulary stored on the first line of ``vocabulary.txt``.

    The line is parsed with ``ast.literal_eval`` and the resulting object is
    returned unchanged.
    """
    with open("vocabulary.txt", "r", encoding="utf-8") as handle:
        first_line = handle.readline()
    return ast.literal_eval(first_line)

In [10]:
def load_sectors():
    """Attach a NAF level-2 sector to every company listed in ``data.csv``.

    Inputs, read from the working directory:

    * ``data.csv`` -- comma-delimited, with a ``siren`` column;
    * ``naf2008_5_niveaux.csv`` -- ``;``-delimited, with ``NIV5`` and
      ``NIV2`` columns;
    * ``company_referentielle.json1`` -- one JSON object per line, each with
      ``siren`` and ``naf_ent`` keys.

    Outputs, written to the working directory:

    * ``company_sector.csv`` -- a ``siren#sector`` row for every JSON record
      whose siren appears in ``data.csv``;
    * ``topics.txt`` -- the distinct sectors, one per line, in sorted order.

    The number of distinct sectors is also printed.

    Raises:
        IndexError: if a company's NAF code matches no ``NIV5`` row.
    """
    # A set makes the per-record membership test O(1); only the siren column
    # of data.csv is kept in memory.
    sirens = set(pandas.read_csv("data.csv", delimiter=",")["siren"].tolist())
    naf = pandas.read_csv("naf2008_5_niveaux.csv", delimiter=";")

    sectors = set()
    with open("company_sector.csv", "w", encoding="utf-8") as out, \
            open("company_referentielle.json1", "r", encoding="utf-8") as file:
        out.write("siren#sector\n")
        for line in file:
            # json.loads() no longer accepts an ``encoding`` argument
            # (removed in Python 3.9); the line is already decoded text.
            item = json.loads(line)
            if item["siren"] not in sirens:
                continue
            # Insert a dot after the first two characters of the NAF code to
            # obtain the form compared against the NIV5 column.
            code = item["naf_ent"][:2] + '.' + item["naf_ent"][2:]
            sector = naf[naf["NIV5"] == code]['NIV2'].values[0]
            sectors.add(sector)
            out.write(str(item["siren"]) + '#' + str(sector) + "\n")

    print(len(sectors))
    with open("topics.txt", "w", encoding="utf-8") as f:
        # Sort so the file content does not depend on set iteration order.
        for sector in sorted(sectors, key=str):
            f.write(str(sector) + "\n")

In [11]:
def filtering(valid=None):
    """Keep the documents of ``data.csv`` whose company sector is whitelisted.

    Reads ``company_sector.csv`` (``#``-delimited, ``siren`` and ``sector``
    columns) and ``data.csv`` (comma-delimited, ``siren`` and ``text``
    columns), and writes ``filtered.csv`` with a ``topic#text`` header and
    one ``<sector>#<text>`` line per kept document. ``#`` and line-break
    characters inside the text are replaced by spaces so that each document
    stays on one line with a single delimiter.

    Documents whose siren has no sector in ``company_sector.csv``, or whose
    text is missing, are skipped.

    Args:
        valid: optional iterable of sector codes (as strings) to keep.
            Defaults to the fixed list of sectors below.
    """
    if valid is None:
        valid = ["70", "72", "64", "35", "71", "66", "68", "63", "62", "58", "21", "82", "46", "26"]
    valid = set(valid)

    sector_frame = pandas.read_csv("company_sector.csv", delimiter="#", encoding="utf-8")
    # Build the siren -> sector lookup once instead of scanning the frame for
    # every document. setdefault keeps the first sector when a siren is
    # listed more than once, which is what ``.values[0]`` selected.
    sector_by_siren = {}
    for siren, sector in zip(sector_frame["siren"].tolist(), sector_frame["sector"].tolist()):
        sector_by_siren.setdefault(siren, str(sector))

    documents = pandas.read_csv("data.csv", delimiter=",", encoding="utf-8")
    with open("filtered.csv", "w", encoding="utf-8") as out:
        out.write('topic#text\n')
        for siren, text in zip(documents["siren"].tolist(), documents["text"].tolist()):
            sector = sector_by_siren.get(siren)
            if sector is None or sector not in valid:
                continue
            # pandas yields a float NaN for an empty text cell.
            if not isinstance(text, str):
                continue
            # A bare carriage return would also be read back as a line break.
            cleaned = text.replace("#", " ").replace("\r", " ").replace("\n", " ")
            out.write(sector + "#" + cleaned + "\n")


In [12]:
def split(filename="intermediate.csv", train_fraction=0.95, seed=None):
    """Randomly partition a ``#``-delimited dataset into two CSV files.

    Every row of ``filename`` is independently sent to ``train.csv`` with
    probability ``train_fraction`` and to ``valid.csv`` otherwise, so the
    actual proportions vary from run to run. Both files are written with
    pandas' default ``to_csv`` settings (comma-separated, index included).

    Args:
        filename (str): input file; defaults to ``intermediate.csv``.
        train_fraction (float): probability that a row goes to the training
            file; defaults to 0.95.
        seed (int | None): when given, the split is reproducible. When
            ``None`` the draw uses NumPy's global random state, as before.
    """
    # NOTE(review): the default input is intermediate.csv, not the final2.csv
    # written by normalize_dataset() -- confirm which file is meant to be split.
    df = pandas.read_csv(filename, delimiter="#", encoding="utf-8")
    # np.random (module) and RandomState both expose rand(); the module is
    # used when no seed is requested so existing callers see no change.
    rng = np.random if seed is None else np.random.RandomState(seed)
    msk = rng.rand(len(df)) < train_fraction
    df[msk].to_csv("train.csv")
    df[~msk].to_csv("valid.csv")

In [13]:
def normalize_dataset(max_len=500):
    """Cut the long documents of ``final.csv`` down to a fixed length.

    Reads ``final.csv`` (``#``-delimited, ``topic`` and ``text`` columns,
    where ``text`` is the literal of a list) and writes ``final2.csv`` with a
    ``topic#text`` header. Only rows whose list holds more than ``max_len``
    items are written, truncated to their first ``max_len`` items; all other
    rows are dropped.

    Args:
        max_len (int): length every written list is truncated to; defaults
            to 500.
    """
    df = pandas.read_csv("final.csv", delimiter="#", encoding="utf-8")
    with open("final2.csv", "w", encoding="utf-8") as out:
        out.write("topic#text\n")
        for topic, text in zip(df["topic"], df["text"]):
            words = ast.literal_eval(text)
            # NOTE(review): a list of exactly max_len items is dropped as
            # well, although truncating it would be a no-op -- confirm
            # whether ``>=`` was intended.
            if len(words) > max_len:
                out.write(str(topic) + "#" + str(words[:max_len]) + "\n")

In [14]:
def final_preprocess(fileout):
    """Replace the stems of ``intermediate.csv`` by their vocabulary indices.

    Reads ``intermediate.csv`` (``#``-delimited, ``topic`` and ``text``
    columns, ``text`` being the literal of a list of stems) and the mapping
    returned by ``load_voc()``, then writes ``fileout`` with a ``topic#text``
    header and one ``<topic>#<list of indices>`` line per input row. A stem
    that is not a key of the vocabulary raises ``KeyError``.

    Args:
        fileout (str): path of the file to write.
    """
    frame = pandas.read_csv("intermediate.csv", delimiter="#", encoding="utf-8")
    stem_to_index = load_voc()

    with open(fileout, "w+", encoding="utf-8") as sink:
        sink.write("topic#text\n")
        for row in range(len(frame)):
            encoded = [stem_to_index[stem] for stem in ast.literal_eval(frame["text"][row])]
            sink.write("#".join([str(frame["topic"][row]), str(encoded)]) + "\n")

In [15]:
# Run the whole pipeline; later stages read the files written by earlier ones.
load_sectors()
# load_sectors() has just rewritten topics.txt. Reload the label list so the
# class indices computed by preprocess() match the file now on disk rather
# than whatever was loaded when the notebook started.
CLASSES = load_class()
filtering()
preprocess("filtered.csv")
final_preprocess("final.csv")
normalize_dataset()
split()

In [16]:
def stats():
    """Print every topic that labels more than 99 lines of ``filtered.csv``.

    Each line is counted under the text that precedes its first ``#``; a line
    without ``#`` is counted under its whole content. Counting entire
    ``topic#text`` lines would only ever detect duplicated documents. The
    header line is counted like any other, under the key ``topic``.

    Keys are printed in order of first appearance in the file.
    """
    counts = Counter()
    with open("filtered.csv", "r", encoding="utf-8") as f:
        for line in f:
            # rstrip rather than line[:-1]: the last line may lack a newline.
            counts[line.rstrip("\n").split("#", 1)[0]] += 1
    for key, value in counts.items():
        if value > 99:
            print(str(key))

In [17]:
# Print every key that stats() counts more than 99 times in filtered.csv.
stats()