## Importing & Background Analysis

In [None]:
# Mount Google Drive at /content/drive so the CSV files read later in this
# notebook are reachable (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# Load the labelled training set from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Machine Learning/train.csv")

In [None]:
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# (rows, columns) of the training frame.
df.shape

(7613, 5)

In [None]:
# Summary statistics for the numeric columns; for the 0/1 `target` column the
# mean is the share of rows labelled 1.
df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [None]:
# Column dtypes: the free-text fields are `object`, the id and label are int64.
df.dtypes

Unnamed: 0,0
id,int64
keyword,object
location,object
text,object
target,int64


In [None]:
# Column names available for modelling.
df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [None]:
# Missing values per column, to see which fields would need null handling.
df.isna().sum()

Unnamed: 0,0
id,0
keyword,61
location,2533
text,0
target,0


## Cleaning

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Fetch the NLTK resources used below ('omw-1.4' backs the WordNet lemmatizer).
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource)

# Shared module-level helpers consumed by clean_text().
tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True, preserve_case=False)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one tweet into a space-separated string of lemmatized tokens.

    Each token produced by the module-level ``tokenizer`` goes through:
    stopword removal, URL removal, stripping of every character other than
    ASCII letters, ``#`` and ``@``, a second stopword / minimum-length filter,
    and finally lemmatization with the module-level ``lemmatizer``.

    Stopwords are checked twice on purpose. The first check runs while the
    apostrophe is still present so that contractions in the stopword list
    (such as "don't" in NLTK's English list) are removed; checking only after
    punctuation is stripped would turn them into "dont", which is not in the
    list and would leak into the features.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN) is treated as
            empty.

    Returns:
        The surviving tokens joined by single spaces; "" for non-string input
        or when no token survives.
    """
    if not isinstance(text, str):
        return ""

    cleaned = []
    for token in tokenizer.tokenize(text):
        # First stopword pass: catches contractions before the apostrophe is stripped.
        if token in stop_words:
            continue

        # Remove URLs
        token = re.sub(r'http\S+|www\S+', '', token)
        if not token:
            continue

        # Remove numbers and punctuation, keeping the hashtag/mention markers
        token = re.sub(r'[^a-zA-Z#@]', '', token)

        # Second pass: drop stopwords exposed by the stripping, and short tokens
        if len(token) <= 1 or token in stop_words:
            continue

        # Lemmatization
        cleaned.append(lemmatizer.lemmatize(token))

    return ' '.join(cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Apply clean_text to every tweet and store the result in a new column,
# leaving the raw `text` column untouched.
print("Cleaning the text data...")
df['cleaned_text'] = df['text'].apply(clean_text)

Cleaning the text data...


## Model Testing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# NLP vectorization
# TF-IDF over unigrams and bigrams, capped at the 5,000 most frequent terms.
# Vectorize the *cleaned* tweets: that is the column the Kaggle submission
# section trains on, so cross-validation should score the same input.
# (Scores recorded by earlier runs of this notebook came from the raw `text`
# column; re-run the model cells to refresh them.)
# NOTE(review): the vectorizer is fitted on all rows before cross-validation,
# so the IDF statistics include the validation folds; wrapping vectorizer and
# model in a sklearn Pipeline would give a leak-free estimate.
print("Vectorizing the text data...")
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
y = df['target']  # 0/1 label column used as the prediction target

Vectorizing the text data...


In [None]:
# Vocabulary terms (unigrams and bigrams) learned by the vectorizer, in the
# same order as the columns of X.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
from sklearn.model_selection import cross_val_score

Random Forest

In [None]:
# Random Forest baseline

# import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# hyperparameters collected in one place, then unpacked into the estimator
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)

# 5-fold cross-validated accuracy on the TF-IDF features
cv_scores_rf = cross_val_score(estimator=rf, X=X, y=y, scoring='accuracy', cv=5)

# report the average over the folds
rf_mean_accuracy = cv_scores_rf.mean()
print("Random Forest Cross-Validation Accuracy:", rf_mean_accuracy)

Random Forest Cross-Validation Accuracy: 0.701829848585379


XGBoost

In [None]:
# XGBoost baseline (library defaults)

# import libraries
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# default-configured gradient-boosted classifier
xgb = XGBClassifier()

# 5-fold cross-validated accuracy on the TF-IDF features
cv_scores_xgb = cross_val_score(estimator=xgb, X=X, y=y, scoring='accuracy', cv=5)

# report the average over the folds
xgb_mean_accuracy = cv_scores_xgb.mean()
print("XGBoost Cross-Validation Accuracy:", xgb_mean_accuracy)

XGBoost Cross-Validation Accuracy: 0.698941417753017


Logistic Regression

In [None]:
# Logistic Regression baseline

# import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# L2-regularised model on the liblinear solver; settings gathered in one dict
logreg_params = {
    "solver": 'liblinear',
    "C": 0.5,
    "penalty": 'l2',
    "max_iter": 1000,
    "random_state": 42,
}
logreg = LogisticRegression(**logreg_params)

# 5-fold cross-validated accuracy on the TF-IDF features
cv_scores_logreg = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=5)

# report the average over the folds
logreg_mean_accuracy = cv_scores_logreg.mean()
print("Logistic Regression Cross-Validation Accuracy:", logreg_mean_accuracy)

Logistic Regression Cross-Validation Accuracy: 0.7319119967765398


StratifiedKFold

In [None]:
# Explicit StratifiedKFold splitter (a cross-validation strategy, not a model).
# For a classifier, cross_val_score with an integer cv already uses unshuffled
# StratifiedKFold, so this reproduces the cv=5 Random Forest score.
# NOTE: this rebinds cv_scores_rf; `skf` is reused by the cells that follow.

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
cv_scores_rf = cross_val_score(rf, X, y, cv=skf, scoring='accuracy')
print("StratifiedKFoldL", cv_scores_rf.mean())

StratifiedKFoldL 0.701829848585379


### TF‑IDF Vectorizer

In [None]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer with n-grams (1-2) and a larger 10,000-term vocabulary.
# Fit on the cleaned tweets so this evaluation matches the submission
# pipeline, which uses the same vectorizer settings on `cleaned_text`.
# (Scores recorded by earlier runs came from the raw `text` column; re-run
# the model cells below to refresh them.)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['target']

Random Forest

In [None]:
# Random Forest scored with the stratified splitter `skf`
rf = RandomForestClassifier(
    random_state=42,
    min_samples_leaf=2,
    min_samples_split=5,
    max_depth=20,
    n_estimators=200,
)
rf_scores = cross_val_score(estimator=rf, X=X, y=y, scoring='accuracy', cv=skf)
rf_mean = rf_scores.mean()
print("Random Forest Accuracy:", rf_mean)

Random Forest Accuracy: 0.6794987588470436


XGBoost

In [None]:
# XGBoost with default settings, scored with the stratified splitter `skf`
xgb = XGBClassifier()
xgb_scores = cross_val_score(estimator=xgb, X=X, y=y, scoring='accuracy', cv=skf)
xgb_mean = xgb_scores.mean()
print("XGBoost Accuracy:", xgb_mean)

XGBoost Accuracy: 0.6953946624814604


## Kaggle Submission

In [None]:
# Load the Kaggle test set (the rows to predict for the submission) from the
# same Drive folder as the training data.
test = pd.read_csv("/content/drive/MyDrive/Machine Learning/test.csv")

In [None]:
# Clean the test tweets with the same clean_text() used on the training data,
# so both sets go through identical preprocessing.
test['cleaned_text'] = test['text'].apply(clean_text)

In [None]:
# TF-IDF Vectorization on the training data
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the vocabulary and IDF weights from the training tweets only, so that
# nothing from the test set influences the features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
tfidf.fit(df['cleaned_text'])  # fit only on the training data

In [None]:
# Project both sets onto the vocabulary learned from the training data
X_train = tfidf.transform(df['cleaned_text'])  # Training data transformed
X_test = tfidf.transform(test['cleaned_text'])  # Test data transformed

In [None]:
y_train = df['target']  # training labels, taken from the same frame (and row order) as X_train

In [None]:
# train the XGBoost model used for the submission
# NOTE(review): the cross-validation cells earlier in this notebook recorded
# the highest accuracy for Logistic Regression, yet XGBoost is what gets
# submitted -- confirm this choice is intentional.
from xgboost import XGBClassifier
xgb = XGBClassifier()

# fit on the full training data (no hold-out) before predicting the test set
xgb.fit(X_train, y_train)

In [None]:
# predict a label for every row of the Kaggle test set
y_pred = xgb.predict(X_test)

In [None]:
# Assemble the two-column frame that is written out as the submission.
submission = pd.DataFrame({
    'id': test['id'],      # isolating ID column from the test set
    'target': y_pred       # model's predictions (0 or 1)
})

In [None]:
# Save the submission DataFrame to a CSV file (index=False keeps the row index out of the file)
submission.to_csv("submission.csv", index=False)

# Download the file to your local machine (google.colab helper, Colab only)
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>