In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [19]:
indie_df = pd.read_csv('Data/indiegogo_2.csv')

In [None]:
#create target value column, based on funded_percent
indie_df['funded_percent'] = indie_df['funded_percent'].str.replace('%', '').astype(float)
indie_df['is_success'] = (indie_df['funded_percent'] >= 80).astype(int)

In [None]:
#add day of the week column, based on launch date
from datetime import datetime
indie_df['date_launch'] = pd.to_datetime(indie_df['date_launch'])
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
indie_df['day_of_week'] = indie_df['date_launch'].apply(lambda x: day_names[x.weekday()])

In [None]:
#dummifying 
day_dummies = pd.get_dummies(indie_df['day_of_week'])
indie_df = pd.concat([indie_df, day_dummies], axis=1)
indie_df.columns = indie_df.columns.str.lower()

In [None]:
#indie_df = pd.to_csv('Data/indiepygo_ready.csv') #save pdf

In [20]:
df = pd.read_csv('Data/IndiePyGo_nulldropped.csv')

In [21]:
indie_text_df = df[['title', 'tagline', 'is_success']]
indie_text_df

Unnamed: 0,title,tagline,is_success
0,Join the Electric Revolution!!!,Pure electric motorcycle proves a powerful alt...,0
1,Relief Trip to Haiti,Send Me to Haiti...I'm needed there!,0
2,"Out To Reach Leogane, Haiti 2010",Haiti Relief Mission to Leogane,0
3,The Transpersonal Papers: 1861-2010,My third book on Fezziwig Press.,0
4,Homeless Veterans need a Hand UP not Hand Out!,Homeless Veterans Transitional Housing Develop...,0
...,...,...,...
20613,Totally Gay Productions,Trans filmmaker making queer documentaries. It...,0
20614,Lady Crow debut EP,"Help us finance Every Stone, our first EP!",0
20615,"JOSA, the ultimate Venice-Inspired bracelet","Benefits Save Venice, helps to protect Venice'...",0
20616,My Campaign Title,Hi,0


In [22]:
indie_text_df['text'] = indie_text_df['title'] + ' ' + indie_text_df['tagline']
indie_text_df['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indie_text_df['text'] = indie_text_df['title'] + ' ' + indie_text_df['tagline']


0        Join the Electric Revolution!!! Pure electric ...
1        Relief Trip to Haiti Send Me to Haiti...I'm ne...
2        Out To Reach Leogane, Haiti 2010 Haiti Relief ...
3        The Transpersonal Papers: 1861-2010 My third b...
4        Homeless Veterans need a Hand UP not Hand Out!...
                               ...                        
20613    Totally Gay Productions Trans filmmaker making...
20614    Lady Crow debut EP Help us finance Every Stone...
20615    JOSA, the ultimate Venice-Inspired bracelet Be...
20616                                 My Campaign Title Hi
20617                                 My Campaign Title Hi
Name: text, Length: 20618, dtype: object

In [23]:
indie_text_df.drop(['title', 'tagline'], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indie_text_df.drop(['title', 'tagline'], axis=1, inplace=True)


In [24]:
indie_text_df

Unnamed: 0,is_success,text
0,0,Join the Electric Revolution!!! Pure electric ...
1,0,Relief Trip to Haiti Send Me to Haiti...I'm ne...
2,0,"Out To Reach Leogane, Haiti 2010 Haiti Relief ..."
3,0,The Transpersonal Papers: 1861-2010 My third b...
4,0,Homeless Veterans need a Hand UP not Hand Out!...
...,...,...
20613,0,Totally Gay Productions Trans filmmaker making...
20614,0,Lady Crow debut EP Help us finance Every Stone...
20615,0,"JOSA, the ultimate Venice-Inspired bracelet Be..."
20616,0,My Campaign Title Hi


In [18]:
indie_text_df.to_csv('indiepygo_text_fornlp.csv', index=False)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    indie_text_df.text, 
    indie_text_df.is_success, 
    test_size=0.2, 
    random_state=2022,
    stratify=indie_text_df.is_success
)

In [22]:
y_train.value_counts()

0    12826
1     3668
Name: is_success, dtype: int64

In [23]:
y_test.value_counts()

0    3207
1     917
Name: is_success, dtype: int64

In [3]:
from sklearn.pipeline import Pipeline

In [26]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88      3207
           1       0.66      0.12      0.21       917

    accuracy                           0.79      4124
   macro avg       0.73      0.55      0.54      4124
weighted avg       0.77      0.79      0.73      4124



In [27]:
from sklearn.naive_bayes import MultinomialNB


clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Multi NB', MultinomialNB())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88      3207
           1       0.76      0.06      0.11       917

    accuracy                           0.79      4124
   macro avg       0.77      0.53      0.50      4124
weighted avg       0.78      0.79      0.71      4124



In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88      3207
           1       0.79      0.03      0.06       917

    accuracy                           0.78      4124
   macro avg       0.79      0.51      0.47      4124
weighted avg       0.79      0.78      0.69      4124



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [9]:
indie_text_df['text'] = indie_text_df['text'].str.lower()
indie_text_df['text'] = indie_text_df['text'].str.replace('[^\w\s]','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indie_text_df['text'] = indie_text_df['text'].str.lower()
  indie_text_df['text'] = indie_text_df['text'].str.replace('[^\w\s]','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indie_text_df['text'] = indie_text_df['text'].str.replace('[^\w\s]','')


In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
indie_text_df['text'] = indie_text_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliettedegoul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  indie_text_df['text'] = indie_text_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    indie_text_df.text, 
    indie_text_df.is_success, 
    test_size=0.2, 
    random_state=2022,
    stratify=indie_text_df.is_success
)

In [33]:
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88      3207
           1       0.66      0.16      0.26       917

    accuracy                           0.79      4124
   macro avg       0.73      0.57      0.57      4124
weighted avg       0.77      0.79      0.74      4124



In [15]:
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
     ('Word2Vec',Word2Vec(sentences=indie_text_df['text'], vector_size=100, window=5, min_count=1, workers=4)),    
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))


TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'Word2Vec(vocab=425, vector_size=100, alpha=0.025)' (type <class 'gensim.models.word2vec.Word2Vec'>) doesn't

In [28]:
def get_sentence_embedding(w2v_model, sentence):
    """
    Compute the sentence embedding using the average of word embeddings.
    """
    words = sentence.split()
    words_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vectors) == 0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(words_vectors, axis=0)

In [29]:
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(indie_text_df['text'], indie_text_df['is_success'], test_size=0.2, random_state=42, stratify=indie_text_df['is_success'])

# Train Word2Vec
w2v_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

# Transform the text data to embeddings
X_train_embeddings = [get_sentence_embedding(w2v_model, sentence) for sentence in X_train]
X_test_embeddings = [get_sentence_embedding(w2v_model, sentence) for sentence in X_test]

# Fit the random forest model
rf = RandomForestClassifier()
rf.fit(X_train_embeddings, y_train)

# Predict on the test set
y_pred = rf.predict(X_test_embeddings)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.99      0.87      3207
           1       0.48      0.04      0.07       917

    accuracy                           0.78      4124
   macro avg       0.63      0.51      0.47      4124
weighted avg       0.72      0.78      0.70      4124

