## Term Frequency-Inverse Document Frequency (TF-IDF)

TF(t, d) = number of times term t appears in document d / total number of terms in document d
IDF(t) = log (total number of documents / number of documents that contain term t)
TF-IDF(t, d) = TF(t, d) * IDF(t)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import spacy

In [2]:
# spaCy English pipeline; used further down to tokenise text and flag stop words.
nlp = spacy.load("en_core_web_sm")

# Read the training set and replace every missing value with the literal string "NA".
df = pd.read_csv("train.csv").fillna("NA")

In [3]:
# Feature columns (all free text) and the label column.
Xt = df[['keyword', 'location', 'text']]
yt = df['target']

# Hold out 30% of the rows for evaluation. The seed is fixed so that the same
# rows land in each partition on every run; without it the split is re-drawn
# on each execution.
X_train, X_test, y_train, y_test = train_test_split(
    Xt, yt, test_size=0.3, random_state=42
)

In [65]:
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class textPreprocessor(BaseEstimator, TransformerMixin):
    """Merge several text columns into one string per row and TF-IDF encode it.

    Each cell is passed through ``process_text`` (stop-word removal using the
    module-level spaCy pipeline ``nlp``), the cleaned cells of a row are joined
    with single spaces, and the resulting strings are vectorised with a
    ``TfidfVectorizer`` restricted to uni- and bi-grams and 5000 features.

    The same preparation is applied in ``fit`` and ``transform`` so that the
    vocabulary is learned from exactly the kind of text that is later scored.

    Parameters
    ----------
    verbose : bool, default=True
        When true, ``transform`` and ``fit_transform`` print the first rows of
        the prepared frame (the preview this transformer has always emitted).
        Set to ``False`` to silence it.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        self.tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

    def fit(self, X, y=None):
        """Learn the TF-IDF vocabulary from the cleaned, combined text of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        prepared = self._prepare(X)
        self.tfidf.fit(prepared['combined_text'].values)
        return self

    def transform(self, X):
        """Return the sparse TF-IDF matrix for the cleaned, combined text of ``X``."""
        prepared = self._prepare(X)
        if self.verbose:
            print(prepared.head())
        # Apply the already fitted TF-IDF transformation
        return self.tfidf.transform(prepared['combined_text'].values)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its TF-IDF matrix in a single pass.

        Overrides the mixin's fit-then-transform default so the spaCy cleaning,
        which dominates the runtime, is executed once instead of twice.
        """
        prepared = self._prepare(X)
        if self.verbose:
            print(prepared.head())
        return self.tfidf.fit_transform(prepared['combined_text'].values)

    def _prepare(self, X):
        """Return a cleaned copy of ``X`` with an added ``combined_text`` column."""
        # fillna returns a new frame, so the caller's data is never modified.
        cleaned = X.fillna("")
        for col in cleaned.columns:
            # Cast to str first: the spaCy pipeline only accepts strings.
            cleaned[col] = cleaned[col].astype(str).apply(self.process_text)
        # Combine text from all columns into one new column
        cleaned['combined_text'] = cleaned.apply(
            lambda row: ' '.join(row.values.astype(str)), axis=1
        )
        return cleaned

    def process_text(self, text):
        """Return ``text`` with spaCy stop-word tokens removed, space-joined."""
        doc = nlp(text)  # Process the string with the NLP model
        no_stop_words = [token.text for token in doc if not token.is_stop]
        return " ".join(no_stop_words)


# Columns handed to the text transformer; their cells are merged into a
# single string per row before vectorisation.
text_columns = ['text', 'location', 'keyword']

# One-step pipeline around the custom TF-IDF transformer.
text_transformer = Pipeline(steps=[('preprocessor', textPreprocessor())])

# Route the text columns through the text pipeline.
preprocessor = ColumnTransformer(
    transformers=[('Text', text_transformer, text_columns)]
)

# Full model: TF-IDF features followed by a random forest.
clf = Pipeline(steps=[
    ('txt preprocessor', preprocessor),
    ('rf', RandomForestClassifier()),
])

# X_train must contain every column named in text_columns.
clf.fit(X_train, y_train)


                                                   text       location  \
5980  like little boy better sit ass stop screaming ...      nap queen   
2350  chance work old FFA foodstand fairgrounds . fi...  Atlantic , IA   
5052  @Cali74142290 lol natural disaster / hospital ...         Canada   
766   Catfish retweeted & amp ; notifications blew ?...             NA   
1648  Petition | Heartless owner whipped horse colla...            USA   

                 keyword                                      combined_text  
5980           screaming  like little boy better sit ass stop screaming ...  
2350          demolition  chance work old FFA foodstand fairgrounds . fi...  
5052  natural%20disaster  @Cali74142290 lol natural disaster / hospital ...  
766            blew%20up  Catfish retweeted & amp ; notifications blew ?...  
1648           collapsed  Petition | Heartless owner whipped horse colla...  


In [66]:
# Score the held-out rows with the fitted pipeline and print the
# precision / recall / F1 summary for each class.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                                                   text location  \
4821  @noah_anyname concentration camps mass murder ...       NA   
6089  @supernovalester feel bad . literally feel fee...       NA   
6341  Rightways : Building structural integrity & am...     Asia   
4249  @rawfoodbliss middle humid heat wave patch for...       NA   
5043  bae soak mudslide backroom ? ? ? ? # thisiswhy...       NA   

                   keyword                                      combined_text  
4821         mass%20murder  @noah_anyname concentration camps mass murder ...  
6089               sinking  @supernovalester feel bad . literally feel fee...  
6341  structural%20failure  Rightways : Building structural integrity & am...  
4249           heat%20wave  @rawfoodbliss middle humid heat wave patch for...  
5043              mudslide  bae soak mudslide backroom ? ? ? ? # thisiswhy...  
              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1353
  

: 