In [1]:
# !pip install transformers

In [2]:
# !pip install transformers torch pytesseract

In [2]:
from deep_translator import GoogleTranslator
# 1 - DATA MANIPULATION
import pandas as pd
import numpy as np

# 2 - DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# # 3 - STATISTICS
# from statsmodels.graphics.gofplots import qqplot

# 4 - MACHINE LEARNING
## 4.1 - Preprocessing
from sklearn.model_selection import train_test_split
### 4.1.1 - Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

### 4.1.2 - Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

### 4.1.3 - Crossvalidation, Training, Model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### 4.1.4 - Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics  import ConfusionMatrixDisplay

pd.set_option('display.max_colwidth', None)

# from transformers import pipeline

In [3]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
import pandas as pd
import ast

def load_projects(filterLive=True):
    '''
    read the raw csv of projects
    without state=='live' if filterLive=True,
    prepare for the merge ('ID' renamed to 'id')
    '''
    df_projects = pd.read_csv('../data/raw/ks-projects-201801.csv')
    df_projects.rename(columns={'ID':'id'}, inplace=True)
    if filterLive :
        df_projects = df_projects[df_projects['state']!='live']
        df_projects['state'] = df_projects['state'].apply(lambda x: 1 if x == 'successful' else 0)

    return df_projects

def load_commentaires():
    '''
    read the raw csv of comments
    by filtering out the empty coments
    '''
    df_comments = pd.read_csv('../data/raw/comments_clean.csv')
    df_comments = df_comments[df_comments['comments']!='[]']
    # cast string as py list
    df_comments['commentaires'] = df_comments['comments'].apply(ast.literal_eval)
    df_comments.drop(columns=['comments'], inplace=True)

    return df_comments

def load_merged_data(ligne_par_ligne=True):
    '''
    merge the two df : comments and projects
    ligne par ligne si ligne_par_ligne==True,
    sinon par projet
    '''
    df_comments = load_commentaires()
    df_projects = load_projects()

    df_merged = (
        df_comments.merge(
            df_projects[['id', 'state']]
        )
        #.drop(columns=['id'])
    )

    if ligne_par_ligne :
        df_merged = df_merged.explode('commentaires').reset_index(drop=True)
    else :
        df_merged['commentaires'] = df_merged['commentaires'].apply(
            lambda x: '; '.join(x)
        )

    return df_merged.rename(columns={"commentaires": "X", 'state':'y'})

In [5]:
# !pip install -r requirements.txt
import pandas as pd

import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

# debug local:
import data

import re

stop_words = set(stopwords.words('english')) ## define stopw
lemmatizer = WordNetLemmatizer()

# Nettoyage de la ponctuation, chiffres et mise en minuscule
def cleaning(sentence):

    '''
    Nettoie une phrase :
    - supprime les espaces en trop
    - met en minuscules
    - supprime les chiffres et la ponctuation
    '''
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers

    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    return sentence

# Tokenisation simple (via nltk)
def tokenize(sentence):

    '''
    Tokenise une phrase en liste de mots
    '''
    tokenized_sentence = word_tokenize(sentence) ## tokenize
    return tokenized_sentence


# def removing_words(tokenized_sentence):
#     '''
#     Supprime les stopwords de la phrase tokenisée
#     '''
#     ## REMOVE WORD if contains non alphabet character check string
#     tokenized_sentence_cleaned = [ ## remove stopwords
#         w for w in tokenized_sentence if not w in stop_words
#     ]
#     return tokenized_sentence_cleaned

def removing_words(tokenized_sentence):
    '''
    Supprime les stopwords et les mots qui ne sont pas composés uniquement de lettres (avec accents) et chiffres.
    '''
    tokenized_sentence_cleaned = [
        w for w in tokenized_sentence
        if (w not in stop_words) and re.fullmatch(r'[a-zA-ZÀ-ÿ0-9]+', w)
    ]
    return tokenized_sentence_cleaned
# Lemmatisation des mots
def lemmatizing(tokenized_sentence_cleaned):

    '''
    Lemmatisation des mots d'abord comme verbes, puis comme noms
    '''
    # Étape 1 : lemmatisation comme verbes
    verb_lemmatized = [
        lemmatizer.lemmatize(word, pos="v")
        for word in tokenized_sentence_cleaned
    ]

    # Étape 2 : lemmatisation comme noms
    noun_lemmatized = [
        lemmatizer.lemmatize(word, pos="n")
        for word in verb_lemmatized
    ]

    return noun_lemmatized


# Fonction complète de nettoyage d'une phrase (pipeline)
def preprocessing_sentence(comment: str ,
                           tokenized: bool=True,
                           removed_word:bool=True,
                           lemmatized:bool=True)->str:
    '''
    Applique les étapes NLP à une phrase :
    - nettoyage
    - tokenisation (optionnelle)
    - suppression des stopwords (optionnelle)
    - lemmatisation (optionnelle)
    Retourne la phrase nettoyée sous forme de string.
    '''
    clean_word=cleaning(comment)

    if tokenized:
        clean_word=tokenize(clean_word)

    if removed_word:
        clean_word=removing_words(clean_word)

    if lemmatized:
        clean_word=lemmatizing(clean_word)

    #assurer de retourner une string
    if isinstance(clean_word, list):
        return ' '.join(word for word in clean_word)
    else:
        return clean_word

# Applique preprocessing_phrase à chaque ligne d'un DataFrame
def preprocessing(X: pd.DataFrame,
                  ngram_range=(1,1),
                  tokenized: bool = True,
                  removed_word: bool = True,
                  lemmatized: bool = True
                  ) -> pd.DataFrame :
    '''
    Applique le prétraitement NLP (sans vectorisation) à chaque ligne d’un DataFrame.
    Retourne une colonne contenant les phrases nettoyées.
    '''
    return X.apply(preprocessing_sentence,tokenized=tokenized,removed_word=removed_word,lemmatized=lemmatized)

# Applique preprocessing + vectorisation avec CountVectorizer
def preprocessing_with_vectorization(X: pd.DataFrame, ngram_range=(1,1)) : #renvoi : scipy.sparse._csr.csr_matrix
    '''
    Applique le prétraitement NLP puis vectorise les phrases avec CountVectorizer.
    Retourne une matrice creuse (sparse matrix) de type csr_matrix.
    '''
    X_preproc =X.apply(preprocessing_sentence)
    # vectorization :
    count_vectorizer = CountVectorizer(ngram_range=ngram_range)
    X_preproc = count_vectorizer.fit_transform(X_preproc)
    #print(type(X_preproc))
    return X_preproc


In [6]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score

cleaned_comments = load_merged_data(ligne_par_ligne=True)#.iloc[:10000]
X = cleaned_comments['X']
y = cleaned_comments['y']


preproc_X =  preprocessing(X,
                                             tokenized = True,
                                             removed_word = True,
                                             lemmatized=True)

#voir le vocabulaire
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(preproc_X)
print(f"Nombre total de mots dans le vocabulaire : {len(tfidf_vectorizer.vocabulary_)}")

pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)

cv_results = cross_validate(pipeline_naive_bayes, preproc_X, y, cv = 5, scoring = ["recall"])
average_recall = cv_results["test_recall"].mean()
results = average_recall
print(results)


Nombre total de mots dans le vocabulaire : 76962
0.9999525509480648


In [7]:
# 1. Fit une fois juste pour voir le vocabulaire
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(preproc_X)
print(f"Nombre total de mots dans le vocabulaire : {len(tfidf_vectorizer.vocabulary_)}")


Nombre total de mots dans le vocabulaire : 76962


In [8]:
cleaned_comments.head()

Unnamed: 0,id,X,y
0,1000468345,Electrical Solutions Group installed the iRNinja at my parents house and they love it They use to call me every night so that they could watch Netflix would have to walk them through changing the TV source to their Roku turning on their audio receiver and starting Netflix Not anymore now they just press one button,0
1,1000468345,Awesome product would like to get one asap Wish you best of luck,0
2,1000629643,Alex In my country taxes and fees are really high when you send the ODIN if you declare the ODIN having low value like chinese websites it would be very helpfull for me and maybe for more people,1
3,1000629643,Watching was thinking about your proposition and think it not really honest In my mind it impossible doing that but thank you for your help Daniel called this morning to rent projector and can take it tomorrow Thank you for your help didn know it was possible to rent projectors It quite expensive months renting and can pay new projector You said the MSRP of ODIN is don understand what you mean with that backed this project and will pay lot of taxes when will receive it think Because spent less money that the MSRP you think can spend more money now Not sure understand,1
4,1000629643,Monthieu laurent It was my choice having my personals computer and projector Because had to many problems with the company hardware Without this choice wouldn back this project don need portative projector to stay home Do you understand my opinion don want to fight with you or with Alex have the laptop but need projector don want to buy an other hate spending money,1


In [9]:
raw_project= load_projects()

In [10]:
raw_project

Unnamed: 0,id,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,0,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,0,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,0,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,0,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,0,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,0,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy feature unlike any other!!,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,0,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,0,6,US,200.0,200.0,15000.00


In [11]:
!pip install langid



In [12]:
# import fasttext
# import pandas as pd

# # Charger le modèle de langue FastText
# ft_model = fasttext.load_model('lid.176.bin')

# # Fonction de détection de langue
# def detect_fasttext(text):
#     labels, probs = ft_model.predict(text)
#     labels = list(labels)
#     probs = list(probs)
#     label = labels[0].replace('__label__', '')
#     prob = float(probs[0])
#     return label, prob

# # # Appliquer la fonction
# # df[['lang', 'prob']] = df['X'].apply(detect_fasttext).apply(pd.Series)
# test=detect_fasttext('Bonjour, comment allez-vous ?')
# # Afficher le résultat
# print(test)


In [13]:

import langid
df = cleaned_comments.copy()
def detect_lang_and_proba(df, column_name='X'):
    """
    Ajoute deux colonnes au DataFrame :
    - 'langue' : code langue détecté
    - 'langue_proba' : probabilité associée à la langue détectée

    Paramètres :
    - df : DataFrame contenant la colonne à analyser
    - column_name : nom de la colonne contenant le texte (par défaut 'X')
    """
    langues = []
    probabilites = []

    for text in df[column_name]:
        langue, proba = langid.classify(text)
        langues.append(langue)
        probabilites.append(proba)

    df['langue'] = langues
    df['langue_proba'] = probabilites
    return df

In [14]:
import re


# Expression régulière pour détecter les caractères cyrilliques
pattern_cyrillic = re.compile('[\u0400-\u04FF]')

# Fonction pour vérifier si un texte contient du russe (caractères cyrilliques)
def contains_russian(text):
    return bool(pattern_cyrillic.search(text))

# Filtrer les commentaires contenant du russe
df_russian = df[df['X'].apply(contains_russian)]

# Afficher combien et quelques exemples
print(f"Nombre de commentaires contenant du russe : {len(df_russian)}")
df_russian

Nombre de commentaires contenant du russe : 15


Unnamed: 0,id,X,y
11419,1089972264,With the app being so much better to use for the Fluent Forever method than Anki find that am studying more more frequently than before the app One problem am having with the minimal pairs is when they us an English word and the recording uses pronounciation that uses sounds that aren used in Australian English making me have to learn two new sounds of which only one is useful to me An example of this is with the Russian Сэт and English set As Australian English weakens final t by either not aspirating it or replacing it with glotal stop and the t in the recording for set is strongly aspirated this makes the Russian sound more natural and the English sound more foreign I wondering if other non Americans are having similar problems with the use of US dialect for the English pronounciations Or is it just me,1
11468,1089972264,As can Beta test the app I ve been looking at the Minimal Pairs from the Anki version and have couple of suggestions examples will be from the Russian version as that is what am using It would be nice to have an indication on what we are meant to learn from each pair especially when of the pair is in English Example Гир and gear it isn clear to beginner what Russian sound is used for the ea in gear so not sure what am meant to be learning and if it is obvious what am meant to be learning then have already learnt it On some of the pairs the most obvious difference between the words is clearly not the difference that is meant to be learnt so being told what we are meant to learn would help will make it wasier to focuss on the real lesson Example the most obvious difference to my ears between the Russian дай and English die is the tone as in Vietnamese or other tonal languages making it harder to pick out the difference should be focussing on Being told what need to focuss on would speed up the learning as wouldn be wasting time learning to differenciate on other elements After having done really badly differenciating between both sides of minimal pair Anki will sometimes end up repeating the same pair several times in row without other cards This really helps to learn to differentiate the two sounds when there is great difficulty Having the option to select pair to practice for short run when having difficulty with pair would help to speed up the learning process This could also be restricted to be only for new pairs or those that are close to the same results as chance only Just couple of ideas that think would be useful to learning,1
28096,1209284858,Иван спасибо Очень приятно Right now it is not supported Maybe sometime in future Yes it is possible And even more libs and server will be open source so indeed you can do anything you want,1
36520,1262352326,Have you packed the fifth batch Come on update where is the trust between people cinquième lot emballé SoH Ha mise à jour nuqDaq oH SARAH bogh Nuvvu Voq fünfte Partie verpackt SoH Ha update nuqDaq oH SARAH Bogh nuvvu Voq kvina parto pakis Soh Ha ĝisdatigo nuqDaq Ho Sara bogh nuvvu voq пятая партия упакованы SoH Ха обновление nuqDaq О Sarah bogh nuvvu Voq fifth batch packed SoH Ha update nuqDaq oH SabtaHbogh nuvpu voq,1
39159,1281269876,Hi I whole family owner but not happy one My mood is spoiled by suspect way of glow discharging think light indicator info of charge left is incorrect Also it charges my old th iphone times meaning it battery less Ah dissapointed with estimated efficient capacity about Аh assume not proper usage or bad maths so request some instructions of the best estimation of capacity without multimeters and other stuff Thanks in advance sorry if answered,1
57622,1405301865,Probably you can sum up At us in Russia it is called to spread on бабки The goods are not present money is not present It okay knew what signed,1
64230,1444409174,Circuit Scribe sent you message on th could you please reply τнänκ чöü,1
82102,1552094001,Would there update after you received all the survey really want to know all details about the project ӦӦ,1
82113,1552094001,Seeing the drop off Up with the sun special feel so uncomfortable Д,1
83601,1556727849,Hi my country is missing in this list Fedex upgrade option How can recieve my order Кирилл у вас получилось как то сдвинуть дело с мертвой точки,1


In [16]:
df = detect_lang_and_proba(df, column_name='X')
# Compter les langues différentes
num_languages = df["langue"].nunique()
print(f"Nombre de langues différentes : {num_languages}")

# Facultatif : afficher les langues détectées
print(df["langue"].value_counts(normalize=True))

Nombre de langues différentes : 56
langue
en    0.993625
de    0.001005
fr    0.000980
es    0.000717
it    0.000558
nl    0.000416
et    0.000353
da    0.000301
sv    0.000225
pt    0.000211
id    0.000149
pl    0.000118
no    0.000104
mg    0.000097
fi    0.000097
ro    0.000094
ca    0.000090
br    0.000083
cy    0.000066
mt    0.000066
cs    0.000066
la    0.000055
sl    0.000052
lt    0.000045
af    0.000042
tl    0.000035
eu    0.000031
xh    0.000028
ms    0.000028
vi    0.000021
tr    0.000021
oc    0.000017
nb    0.000017
sw    0.000017
vo    0.000014
ht    0.000014
ga    0.000014
wa    0.000014
eo    0.000010
nn    0.000010
gl    0.000010
ky    0.000007
jv    0.000007
rw    0.000007
se    0.000007
hr    0.000007
lv    0.000007
hu    0.000007
sq    0.000007
el    0.000007
lb    0.000003
ar    0.000003
qu    0.000003
az    0.000003
ru    0.000003
he    0.000003
Name: proportion, dtype: float64


In [35]:

# def translate_non_english(df):

#     translator = GoogleTranslator(source=df["langue"], target="en")
#     df["translations"]=translator.translate_batch(list(df["X"].values))

#     return df

# translated_df = translate_non_english(df)
from deep_translator import GoogleTranslator
import pandas as pd

def translate_non_english(df):
    translations = []

    for idx, row in df.iterrows():
        if row["langue"] != "en":
            try:
                translated = GoogleTranslator(source=row["langue"], target="en").translate(row["X"])
            except Exception as e:
                print(f"Erreur à la ligne {idx} : {e}")
                translated = row["X"]  # En cas d'erreur, on garde le texte original
        else:
            translated = row["X"]

        translations.append(translated)

    df["translations"] = translations
    return df
translated_df = translate_non_english(df)

Erreur à la ligne 2023 : br --> No support for the provided language.
Please select on of the supported languages:
{'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'assamese': 'as', 'aymara': 'ay', 'azerbaijani': 'az', 'bambara': 'bm', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bhojpuri': 'bho', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-CN', 'chinese (traditional)': 'zh-TW', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dhivehi': 'dv', 'dogri': 'doi', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'ewe': 'ee', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'guarani': 'gn', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'iw', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', '

KeyboardInterrupt: 

In [16]:
df[(df["langue"] == "fr") ]#& (df["langue_proba"] <= -100)]

Unnamed: 0,id,X,y,langue,langue_proba
657,1002453633,Congrats guys,1,fr,-5.061021
4272,1021260036,Want my money back,1,fr,2.452273
6055,1039424225,Yessss congrats,1,fr,-5.061021
6776,1049459131,Cant ship battery,0,fr,-6.998726
7528,1055925159,AndrewS Brilliant idea,0,fr,-29.091976
...,...,...,...,...,...
279595,933236486,Congrats,1,fr,-5.061021
283325,960347124,Yay Congrats,1,fr,-5.061021
283985,967289704,ps Class action lawsuit,1,fr,-29.678188
284151,967479190,Me too Wait for your update,1,fr,-14.085238


In [17]:
preproc_df = pd.DataFrame(preproc_X)
preproc_df

Unnamed: 0,X
0,electrical solution group instal irninja parent house love use call every night could watch netflix would walk change tv source roku turn audio receiver start netflix anymore press one button
1,awesome product would like get one asap wish best luck
2,alex country tax fee really high send odin declare odin low value like chinese website would helpfull maybe people
3,watch think proposition think really honest mind impossible thank help daniel call morning rent projector take tomorrow thank help know possible rent projector quite expensive month rent pay new projector say msrp odin understand mean back project pay lot tax receive think spend less money msrp think spend money sure understand
4,monthieu laurent choice personal computer projector many problem company hardware without choice back project need portative projector stay home understand opinion want fight alex laptop need projector want buy hate spend money
...,...
288627,also receive bill fedex custom please refund amount
288628,get proper delivery expectation rest u matthew someone discussion post link use shoot harris teeter
288629,mitch team guy get california distribution point stock resume ship little funny say people favor ship directly actually save overhead cost sell usa everyone cool people go get feb th refund plus custom charge reimburse earlier thread answer yes suggest may want official update summarize policy well current project state backer fulfillment rest anyone get know soft source one comparison get different meter nothing color different intensity meter curious outright power compare know soft source also anyone get one location shoot eager play around one home think real info happen pay shoot develop real feeling light tool love hear real experience impression base
288630,receive bill fedex u tell would custom additional ship fee u customer please refund amount bill thank


In [18]:
# Filtrer les commentaires contenant du russe
preproc_df_russian = preproc_df[preproc_df['X'].apply(contains_russian)]

# Afficher combien et quelques exemples
print(f"Nombre de commentaires contenant du russe : {len(preproc_df_russian)}")
preproc_df_russian

Nombre de commentaires contenant du russe : 0


Unnamed: 0,X


In [19]:
df

Unnamed: 0,id,X,y,langue,langue_proba
0,1000468345,Electrical Solutions Group installed the iRNinja at my parents house and they love it They use to call me every night so that they could watch Netflix would have to walk them through changing the TV source to their Roku turning on their audio receiver and starting Netflix Not anymore now they just press one button,0,en,-815.663321
1,1000468345,Awesome product would like to get one asap Wish you best of luck,0,en,-115.733649
2,1000629643,Alex In my country taxes and fees are really high when you send the ODIN if you declare the ODIN having low value like chinese websites it would be very helpfull for me and maybe for more people,1,en,-454.736753
3,1000629643,Watching was thinking about your proposition and think it not really honest In my mind it impossible doing that but thank you for your help Daniel called this morning to rent projector and can take it tomorrow Thank you for your help didn know it was possible to rent projectors It quite expensive months renting and can pay new projector You said the MSRP of ODIN is don understand what you mean with that backed this project and will pay lot of taxes when will receive it think Because spent less money that the MSRP you think can spend more money now Not sure understand,1,en,-1488.646401
4,1000629643,Monthieu laurent It was my choice having my personals computer and projector Because had to many problems with the company hardware Without this choice wouldn back this project don need portative projector to stay home Do you understand my opinion don want to fight with you or with Alex have the laptop but need projector don want to buy an other hate spending money,1,en,-898.300369
...,...,...,...,...,...
288627,997256071,also received bill from FedEx for customs Please refund this amount,1,en,-168.741205
288628,997256071,for getting proper delivery expectation for the rest of us Matthew someone here in the discussion posted link to where they used it for shoot with Harris Teeter,1,en,-389.729891
288629,997256071,Mitch and team You guys should get your California distribution point stocked and resume shipping from there It is little funny to say you re doing people favor of shipping directly if you re actually saving yourselves an overhead cost of selling from the USA Is everyone cool on this Are people going to get the Feb th refund plus the customs charges reimbursed Earlier in the this thread the answer was yes but suggest you may want to have an official update to summarize policy as well as the current and projected state of backer fulfillment For the rest of you has anyone got KNOWN SOFT source up against one of these for comparison I ve got few different meters here nothing with color on it but few different intensity meters I very curious about outright power compared to known soft source Also has anyone got one of these on location shoot I eager to play around with one at home but think the real info happens on paid shoot That where you ll develop real feelings about the light as tool I love to hear real experience and impressions based on that,1,en,-2295.221485
288630,997256071,have received bill from FedEx and am in the US We were told there would be no customs or additional shipping fees for US customers Please refund me in the amount of this bill Thank you,1,en,-411.898226


In [20]:
df["X"].duplicated().sum()

np.int64(9457)

In [21]:
df[df["X"].duplicated()]

Unnamed: 0,id,X,y,langue,langue_proba
51,1000697657,Also what is the shelf life once charged If it significant see wonderful emergency lighting solution for storerooms etc,0,en,-179.069712
155,1000776914,An update would be nice,1,en,-53.316920
292,1001265769,Congratulations,1,en,-12.532950
293,1001265769,Congratulations,1,en,-12.532950
385,1001502333,Will the backers who already recieved their headsets get the updated face cushion headstrap and the nose pad together with the bases controllers,1,en,-424.329826
...,...,...,...,...,...
288337,996423439,invoke my rights under Kickstarter Terms of Use Project Creators are required to fulfill all rewards of their successful fundraising campaigns or refund any Backer whose reward they do not or cannot fulfill demand an immediate full refund for my pledge amount,1,en,-565.061130
288388,996423439,VARIABLE THICKNESS SLICING FOR PRINTERS With proper tuning any printer can create exceptionally detailed physical replicas of digital files The time it takes for printer to print an object at very high detail is another matter entirely The lower the layer height the more layers must be printed and the longer print takes to print,1,en,-688.174510
288391,996423439,Sounds good thanks,1,en,-17.379354
288397,996423439,Great,1,en,9.061840


In [22]:
nb_doublons = df.duplicated(subset='X').sum()
nb_doublons

np.int64(9457)

In [23]:
from collections import Counter
import re

preproc_X

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   electrical solution group instal irninja parent house love use call every night could watch netflix would walk change tv source roku turn audio receiver start netflix anymore press one button
1                                                                                                                                                                                                                                                                                                                                   

In [27]:

preproc_x_df = preproc_X.to_frame(name="prepoc_x")
preproc_x_df

Unnamed: 0,prepoc_x
0,electrical solution group instal irninja parent house love use call every night could watch netflix would walk change tv source roku turn audio receiver start netflix anymore press one button
1,awesome product would like get one asap wish best luck
2,alex country tax fee really high send odin declare odin low value like chinese website would helpfull maybe people
3,watch think proposition think really honest mind impossible thank help daniel call morning rent projector take tomorrow thank help know possible rent projector quite expensive month rent pay new projector say msrp odin understand mean back project pay lot tax receive think spend less money msrp think spend money sure understand
4,monthieu laurent choice personal computer projector many problem company hardware without choice back project need portative projector stay home understand opinion want fight alex laptop need projector want buy hate spend money
...,...
288627,also receive bill fedex custom please refund amount
288628,get proper delivery expectation rest u matthew someone discussion post link use shoot harris teeter
288629,mitch team guy get california distribution point stock resume ship little funny say people favor ship directly actually save overhead cost sell usa everyone cool people go get feb th refund plus custom charge reimburse earlier thread answer yes suggest may want official update summarize policy well current project state backer fulfillment rest anyone get know soft source one comparison get different meter nothing color different intensity meter curious outright power compare know soft source also anyone get one location shoot eager play around one home think real info happen pay shoot develop real feeling light tool love hear real experience impression base
288630,receive bill fedex u tell would custom additional ship fee u customer please refund amount bill thank


In [28]:
nb_doublons_preproc=preproc_x_df.duplicated(subset='prepoc_x').sum()
nb_doublons_preproc

np.int64(14297)

In [29]:
filtered_counter = Counter(preproc_x_df["prepoc_x"])
filtered_counter

Counter({'invoke right kickstarter term use project creator require fulfill reward successful fundraise campaign refund backer whose reward fulfill demand full refund pledge amount': 609,
         'update': 581,
         'still wait': 271,
         'congratulation': 241,
         'refund please': 187,
         'thank': 184,
         'receive reward please refund invoke right kickstarter term use project creator require fulfill reward successful fundraise campaign refund backer whose reward fulfill demand full refund pledge amount': 169,
         'update please': 164,
         '': 161,
         'receive order invoke right kickstarter term use project creator require fulfill reward successful fundraise campaign refund backer whose reward fulfill formal request refund pledge amount wish wait longer delivery update product want full refund pledge amount soon possible': 151,
         'ufunbrush everyone want refund po kickstarter item get hookup like guy think product go sound way get child

In [31]:
df_preproc_langue= preproc_x_df.copy()
df_preproc_langue = detect_lang_and_proba(df_preproc_langue, column_name='prepoc_x')
# Compter les langues différentes
num_languages = df_preproc_langue["langue"].nunique()
print(f"Nombre de langues différentes : {num_languages}")

# Facultatif : afficher les langues détectées
print(df_preproc_langue["langue"].value_counts())

Nombre de langues différentes : 59
langue
en    258357
nl      4307
fr      4182
it      3365
de      2793
es      2284
sv      2183
da      1119
af       871
id       836
ro       821
br       710
et       701
no       590
eu       547
pl       503
ca       448
pt       397
cy       317
mt       277
tl       252
fi       220
ms       210
nb       209
mg       172
eo       168
sw       148
sl       139
lt       136
xh       131
tr       128
wa       125
lb        82
nn        77
oc        75
ga        74
cs        70
hr        68
la        63
jv        47
lv        42
ht        40
sq        39
rw        39
hu        35
vo        33
gl        33
an        33
sk        31
zu        28
vi        21
bs        14
az        12
ku        10
se         9
qu         5
is         4
zh         1
fo         1
Name: count, dtype: int64


In [36]:
df_preproc_langue[df_preproc_langue["langue"]=="it"]

Unnamed: 0,prepoc_x,langue,langue_proba
97,guy scam artist one donate money crook,it,-27.924664
108,accord verge forthcoming partial port metro modernui bad get update people finance project,it,-62.314742
291,blind user io mac o delight back project promote inclusion,it,-68.060507
376,hello solve,it,-11.276844
537,suppose send email point store per se use yet send,it,-84.869794
...,...,...,...
288360,hello new kickstarter care little silence invest money information correct,it,-76.089967
288373,hi news,it,0.093283
288399,hi update,it,0.093283
288472,far extra fedex charge week,it,-7.805732


In [None]:







def cleaning(sentence):

    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers

    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation

    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english')) ## define stopwords

    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in tokenized_sentence_cleaned
    ]

    cleaned_sentence = ' '.join(word for word in lemmatized)

    return cleaned_sentence


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import uniform, randint

pipeline = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)

param_distributions = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidfvectorizer__max_df': uniform(0.7, 0.3),
    'tfidfvectorizer__min_df': randint(1, 5),
    'multinomialnb__alpha': uniform(0.0, 1.0)
}


random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    cv=5,
    scoring='recall',
    verbose=1,
    n_jobs=-1
)

random_search.fit(preproc_X, y)

print("Best params (RandomSearch):", random_search.best_params_)
print("Best recall (RandomSearch):", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params (RandomSearch): {'multinomialnb__alpha': np.float64(0.9832308858067882), 'tfidfvectorizer__max_df': np.float64(0.8400288679743939), 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 3)}
Best recall (RandomSearch): 1.0


In [40]:
preproc_X

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   electrical solution group instal irninja parent house love use call every night could watch netflix would walk change tv source roku turn audio receiver start netflix anymore press one button
1                                                                                                                                                                                                                                                                                                                                   

In [43]:
vectorizer = TfidfVectorizer()
preproc_X_sample=preproc_X.sample(1000,random_state=2)
vectorized_documents = vectorizer.fit_transform(preproc_X_sample)
vectorized_documents = pd.DataFrame(
    vectorized_documents.toarray(),
    columns = vectorizer.get_feature_names_out()
)

vectorized_documents

Unnamed: 0,ab,abandon,abc,ability,able,abroad,absolute,absolutely,absurd,ac,...,zhengke,zip,zivix,zombie,zone,zoom,zspider,zungle,zwave,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
