In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


In [3]:
# Load datasets
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
# Display the first few rows of the datasets
print("Train Data:")
print(train_df.head())


Train Data:
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [5]:
print("\nTest Data:")
print(test_df.head())



Test Data:
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Check for missing values
print("\nMissing Values in Train Data:")
print(train_df.isnull().sum())




Missing Values in Train Data:
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [7]:
print("\nMissing Values in Test Data:")
print(test_df.isnull().sum())




Missing Values in Test Data:
id             0
keyword       26
location    1105
text           0
dtype: int64


In [8]:

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [9]:

def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text


In [10]:
# Apply cleaning function
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)



In [11]:
# Preview cleaned data
print(train_df[['text', 'clean_text']].head())



                                                text  \
0  Our Deeds are the Reason of this #earthquake M...   
1             Forest fire near La Ronge Sask. Canada   
2  All residents asked to 'shelter in place' are ...   
3  13,000 people receive #wildfires evacuation or...   
4  Just got sent this photo from Ruby #Alaska as ...   

                                          clean_text  
0       deeds reason earthquake may allah forgive us  
1              forest fire near la ronge sask canada  
2  residents asked shelter place notified officer...  
3  people receive wildfires evacuation orders cal...  
4  got sent photo ruby alaska smoke wildfires pou...  


In [12]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer()



In [13]:
# Transform the clean_text column
X_train = tfidf.fit_transform(train_df['clean_text'])
X_test = tfidf.transform(test_df['clean_text'])


In [14]:
# Target variable
y_train = train_df['target']


In [15]:
# Initialize the model
model = LogisticRegression()


In [16]:
# Train the model
model.fit(X_train, y_train)


In [17]:
# Evaluate on the training set
y_pred = model.predict(X_train)
print("F1 Score (Training):", f1_score(y_train, y_pred))


F1 Score (Training): 0.8546374367622259


In [18]:
# Predict on test data
test_df['target'] = model.predict(X_test)


In [19]:
# Prepare the submission file
submission = test_df[['id', 'target']]
submission.to_csv('submission.csv', index=False)


In [20]:
print("Submission file created: submission.csv")

Submission file created: submission.csv
