In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [3]:
import pandas as pd
import spacy

df = pd.read_csv('train.csv')
nlp = spacy.load('en_core_web_sm')

In [9]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

class textPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.fillna("")
        vals = X.apply(lambda row: ' '.join(row.values.astype(str)), axis=1).apply(self.process_text)
        
        vals2d = np.stack(vals.values)
        vals2dNormalized = MinMaxScaler().fit_transform(vals2d)
        print(vals2dNormalized[1].shape)
        # Apply the already fitted TF-IDF transformation
        return vals2dNormalized

    def process_text(self, text):
        doc = nlp(text)

        filtered_tokens = []
        for token in doc:
            if token.is_punct or token.is_stop:
                continue
            if token.like_url:
                filtered_tokens.append('URL')
            else:
                filtered_tokens.append(token.lemma_.lower())
                #0filtered_tokens.append(token.text.lower())
        return wv.get_mean_vector(filtered_tokens)

In [10]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

text_transformer = Pipeline(steps=[
    ('preprocessor', textPreprocessor())
])

text_columns = ['text', 'location', 'keyword']  # Ensure this list contains the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('Text', text_transformer, text_columns)
    ])

clf = Pipeline([
    ('txt preprocessor', preprocessor),
    ('rf', RandomForestClassifier())
])

clf.fit(df[text_columns], df['target'])

(300,)


In [17]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

text_transformer = Pipeline(steps=[
    ('preprocessor', textPreprocessor())
])

text_columns = ['text', 'location', 'keyword']  # Ensure this list contains the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('Text', text_transformer, text_columns)
    ])

clf = Pipeline([
    ('txt preprocessor', preprocessor),
    ('rf', LogisticRegression())
])

clf.fit(df[text_columns], df['target'])

(300,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

text_transformer = Pipeline(steps=[
    ('preprocessor', textPreprocessor())
])

text_columns = ['text', 'location', 'keyword']  # Ensure this list contains the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('Text', text_transformer, text_columns)
    ])

clf = Pipeline([
    ('txt preprocessor', preprocessor),
    ('rf', KNeighborsClassifier())
])

clf.fit(df[text_columns], df['target'])

(300,)


In [15]:
from tensorflow import keras
from scikeras.wrappers import KerasRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Create a basic neural network model
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(300,)),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')  
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

kr = KerasRegressor(model=model, epochs=500, batch_size=32, verbose=1)

text_transformer = Pipeline(steps=[
    ('preprocessor', textPreprocessor())
])

text_columns = ['text', 'location', 'keyword']  # Ensure this list contains the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('Text', text_transformer, text_columns)
    ])

clf = Pipeline([
    ('txt preprocessor', preprocessor),
    ('kr', kr)
])

clf.fit(df[text_columns], df['target'])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


(300,)
Epoch 1/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 519us/step - accuracy: 0.5885 - loss: 0.6687
Epoch 2/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489us/step - accuracy: 0.7006 - loss: 0.5830
Epoch 3/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step - accuracy: 0.7510 - loss: 0.5312
Epoch 4/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489us/step - accuracy: 0.7445 - loss: 0.5284
Epoch 5/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 502us/step - accuracy: 0.7571 - loss: 0.5095
Epoch 6/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 506us/step - accuracy: 0.7630 - loss: 0.5031
Epoch 7/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 485us/step - accuracy: 0.7446 - loss: 0.5240
Epoch 8/500
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step - accuracy: 0.7660 - loss: 0.4984
Epoch 9/5

In [20]:
df_test = pd.read_csv('test.csv')
y_pred = clf.predict(df_test[text_columns])
y_pred = np.round(y_pred).astype(int)
d = {'id': df_test.id, 'target': y_pred}
df_out = pd.DataFrame(data=d)
df_out.to_csv('submission.csv', index=False)

!kaggle competitions submit -c nlp-getting-started -f submission.csv -m "Gensim attempt = random forest"
!kaggle competitions submissions -c nlp-getting-started

(300,)
400 - Bad Request - Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (2.4 hours from now).



  0%|          | 0.00/25.4k [00:00<?, ?B/s]
 63%|██████▎   | 16.0k/25.4k [00:00<00:00, 80.3kB/s]
100%|██████████| 25.4k/25.4k [00:00<00:00, 53.5kB/s]


fileName        date                 description                            status    publicScore  privateScore  
--------------  -------------------  -------------------------------------  --------  -----------  ------------  
submission.csv  2024-09-30 21:36:19  Gensim attempt = Logistic Regression   complete  0.74900                    
submission.csv  2024-09-30 21:32:33  Gensim attempt = keras neural network  complete  0.74471                    
submission.csv  2024-09-30 21:29:40  Gensim attempt = keras neural network  complete  0.73827                    
submission.csv  2024-09-30 21:28:49  Gensim attempt = random forest         complete  0.00000                    
submission.csv  2024-09-30 21:07:09  Gensim attempt = random forest         complete  0.73030                    
submission.csv  2024-08-09 20:37:42  Bag of words attempt - RandomForest    complete  0.79773                    
submission.csv  2024-08-09 17:49:33  Bag of words attempt - RandomForest    complete  0.