In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/sample-submission/sample_submission.csv


In [2]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# Overview of what we will do: 

1. **Data preprocessing**: Cleaning the tweets by removing irrelevant elements like hashtags, mentions, and URLs.
2. **Tokenization**: Breaking down the tweets into individual words or tokens.
3. **Vectorization**: Converting the text into numerical data (using techniques like Bag of Words or TF-IDF).
4. **Model building**: Using machine learning models (e.g., logistic regression, Naive Bayes) to classify the tweets as related to disasters or not.
5. **Evaluation**: Assessing model performance using accuracy, precision, recall, etc.


# Why did i choose those libraries? 

- **`numpy`**: Great for numerical operations, though for most tasks in NLP, we might not need it much beyond array manipulations.
- **`pandas`**: Essential for handling and preprocessing your dataset (like loading CSV files, handling missing data, etc.).
- **`sklearn.feature_extraction`**: we will likely use this for text vectorization, like **TF-IDF** or **CountVectorizer** (we will use in this project CountVectorizer), which is perfect for converting your tweet data into a format (numercial format) suitable for machine learning models.
- **`sklearn.linear_model`**: Good choice for basic classifiers like **Logistic Regression**, which is commonly used for binary classification tasks (disaster vs. non-disaster).
- **`sklearn.model_selection`**: This will help us with tasks like **train/test splitting** and **cross-validation** to evaluate your model's performance.
- **`sklearn.preprocessing`**: Useful for scaling, encoding, or transforming data, although we might not need much preprocessing beyond text vectorization in this case.


# What is the difference between TF-IDF and CountVectorizer??

Both **CountVectorizer** and **TF-IDF** are techniques used to convert text data into numerical features, but they do it in slightly different ways:

### 1. **CountVectorizer**:
- **What it does**: It converts text into a matrix of token counts.
- **How it works**: Each document (in your case, each tweet) is represented as a row in a matrix, and each word (or token) in the entire dataset is represented as a column. The value in each cell is the count of how many times the word appears in that tweet.
- **Example**: 
  - Tweet 1: "Disaster struck city"
  - Tweet 2: "Disaster response team"
  
  The vocabulary might be: `["Disaster", "struck", "city", "response", "team"]`. The matrix will look like this:
  
  ```
  Tweet 1: [1, 1, 1, 0, 0]
  Tweet 2: [1, 0, 0, 1, 1]
  ```

- **Pros**: Simple and easy to implement.
- **Cons**: It treats all words equally, so common words (like "the," "and") can dominate the matrix, and it doesn't account for word importance.

### 2. **TF-IDF (Term Frequency-Inverse Document Frequency)**:
- **What it does**: It converts text into a matrix that reflects how important each word is, both in individual documents and across the whole dataset.
- **How it works**:
  - **TF (Term Frequency)**: The number of times a word appears in a tweet, just like CountVectorizer.
  - **IDF (Inverse Document Frequency)**: A measure of how rare a word is across all tweets. Words that appear frequently in many tweets (like "disaster") get a lower score because they're less informative.
  
  The final score for each word is computed as:
  \[
  \text{TF-IDF} = \text{TF} \times \text{IDF}
  \]
  
  So, words that appear frequently in one tweet but rarely in others get higher importance, while common words get lower importance.

- **Example**:
  - In the above example, "Disaster" appears in both tweets, so its importance would be reduced in TF-IDF, while words like "struck" and "response" may get higher weights since they are less frequent.

- **Pros**: It downweights common words and highlights rare but important words.
- **Cons**: Slightly more complex than CountVectorizer and may require more tuning.

### When to Use Which?
- **CountVectorizer**: Use it when you want a simple and straightforward approach, especially for models that can handle high-dimensional sparse data, like Naive Bayes.
- **TF-IDF**: Use it when you want to capture the importance of words and avoid over-representing common words in your dataset, especially in classification tasks where the meaning of words (not just their presence) matters.

For your disaster tweet classification task, **TF-IDF** might be a better choice because it helps to highlight the less common but more informative words that are related to disasters. Would you like to try implementing TF-IDF first?

# 1. Loading our data

In [3]:
full_train_df=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
full_test_df=pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# 2. Data exploring and visualisation

In [4]:
print("The first exampls of the data frame\n",full_train_df.head())
print("\nThe last exampls of the data frame\n", full_train_df.tail())
print("\nColumns of our full data frame\n", full_train_df.columns)
#the real disaster tweets
print("\nfive real disaster tweets: \n", full_train_df[full_train_df['target']==1]['text'].values[0:5])
print("\nfive not real disaster tweets: \n", full_train_df[full_train_df['target']==0]['text'].values[0:5])

print("Statistics of our full data fram", full_train_df.describe())


#the number of real disaster and not real (important information to know if we need to balance the model of not )
nb_real_dis=full_train_df[full_train_df['target']==1]['id'].count()
nb_Notreal_dis=full_train_df[full_train_df['target']==0]['id'].count()
print("The number of real disters is ",nb_real_dis )
print("The number of not real disaters is ", nb_Notreal_dis)

The first exampls of the data frame
    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

The last exampls of the data frame
          id keyword location  \
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...       1  


*i think for the feature ingeeniring we will use the columns: keyword, location, text, target*

# 3. Obtain target and predictors


In [5]:
print(full_test_df.columns)

Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [6]:
Y=full_train_df.target
features=["keyword", "location", "text"]
train_df=pd.DataFrame(full_train_df[features])
test_df=pd.DataFrame(full_test_df[features])
#NOTE: FOR MODIFICATION IT'S NOT POSSIBLE TO MODIFY ON A COPY SO IT'S BETTER TO CREATE A NEW DATAFRAME
#train_df=full_train_df[features] 
#test_df=full_test_df[features]

print("train_df.head() is : \n",train_df.head())
print("Statistical description of the train data frame: \n", train_df.describe())

print("test_df.head() is : \n",test_df.head())
print("Statistical description of the test data frame: \n", test_df.describe())
#from the privious prints we will see some of the columns in train_df and test_df contain missing values
print("let's see if there is have more details about the messing values in the data frames train_df and test_df: \n")

cols_with_missing_train =[col for col in train_df.columns if train_df[col].isnull().any()] 
print("  ->cols with missing values in train_df", cols_with_missing_train )
nb_missing_values_per_col_train=train_df[cols_with_missing_train].isnull().sum()
print("  ->The number of missing values in each column above in the train_df is:\n", nb_missing_values_per_col_train)

cols_with_missing_test =[col for col in test_df.columns if test_df[col].isnull().any()] 
print("  ->cols with missing values in test_df", cols_with_missing_test )
nb_missing_values_per_col_test=test_df[cols_with_missing_test].isnull().sum()
print("  ->The number of missing values in each column above in the test_df is:\n", nb_missing_values_per_col_test)

train_df.head() is : 
   keyword location                                               text
0     NaN      NaN  Our Deeds are the Reason of this #earthquake M...
1     NaN      NaN             Forest fire near La Ronge Sask. Canada
2     NaN      NaN  All residents asked to 'shelter in place' are ...
3     NaN      NaN  13,000 people receive #wildfires evacuation or...
4     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...
Statistical description of the train data frame: 
            keyword location                                               text
count         7552     5080                                               7613
unique         221     3341                                               7503
top     fatalities      USA  11-Year-Old Boy Charged With Manslaughter of T...
freq            45      104                                                 10
test_df.head() is : 
   keyword location                                               text
0     NaN      NaN 

# 4. Handling missing values

As we have seen in privious analysis we have many null values.

To handle the missing values in our data frame we will use the methode of Extanded imputation: 
An Extension To Imputation: In this approach, we impute the missing values (ie replace theme with something else like a mean, or an other text..etc). And, additionally, for each column with missing entries in the original dataset, we add a new column that shows the location of the imputed entries.
So also here we use the same process as in the previous, but we add some steps: adding the columns indicating where the imputed values.

**BUT**, to use the methode of imputer, using the SimpleImputer, the columns must be numerical however in our data frames the columns having missing values have string type.

So, we are going to follow the same process but instead of using the simpleImputer (that replaces Null with the mean) we will do it manually and we will replace the null values by a "unkown_location", "unknowen_keyword" for missing location and missing keyword.


**NOTE**: **SimpleImputer** is a class in the scikit-learn library that provides basic strategies for imputing missing values

> **A mistake: **
Using SimpleImputer with String Columns
You can still use SimpleImputer with string columns if you specify the strategy parameter correctly. For example:

In [7]:
#do not run the cell
######################################### remarque ###################################################
#from sklearn.impute import SimpleImputer

# Create the imputer with a constant fill value
#imputer = SimpleImputer(strategy='constant', fill_value='no_keyword')

# Apply imputer to the 'keyword' column (ensure it's 2D by using double brackets)
#train_df['keyword'] = imputer.fit_transform(train_df[['keyword']])
#test_df['keyword'] = imputer.transform(test_df[['keyword']])

# For the 'location' column
#imputer = SimpleImputer(strategy='constant', fill_value='no_location')
#train_df['location'] = imputer.fit_transform(train_df[['location']])
#test_df['location'] = imputer.transform(test_df[['location']])

########################################## fin de la remarque ########################################""


In [8]:
#befor imputing the null values and adding new columns
#we should do a copy of our original data frames
#i am not going to do it because i already have full_train_df and full_test_df

#we will add columns that indicate if there is a missing value or not 
for col in cols_with_missing_train: 
    train_df[col+'_was_missing']=train_df[col].isnull()
    test_df[col+'_was_missing']=test_df[col].isnull()
#impoutation 
train_df['keyword'] = train_df['keyword'].fillna('no_keyword')
train_df['location'] = train_df['location'].fillna('no_location')

test_df['keyword'] = test_df['keyword'].fillna('no_keyword')
test_df['location'] = test_df['location'].fillna('no_location')


let's check the null values again

In [9]:

cols_with_missing_train =[col for col in train_df.columns if train_df[col].isnull().any()] 
print("  ->cols with missing values in train_df", cols_with_missing_train )
nb_missing_values_per_col_train=train_df[cols_with_missing_train].isnull().sum()
print("  ->The number of missing values in each column above in the train_df is:\n", nb_missing_values_per_col_train)

cols_with_missing_test =[col for col in test_df.columns if test_df[col].isnull().any()] 
print("  ->cols with missing values in test_df", cols_with_missing_test )
nb_missing_values_per_col_test=test_df[cols_with_missing_test].isnull().sum()
print("  ->The number of missing values in each column above in the test_df is:\n", nb_missing_values_per_col_test)

  ->cols with missing values in train_df []
  ->The number of missing values in each column above in the train_df is:
 Series([], dtype: float64)
  ->cols with missing values in test_df []
  ->The number of missing values in each column above in the test_df is:
 Series([], dtype: float64)


let's verify if the new columns a well added

In [10]:
train_df.columns
test_df.columns

Index(['keyword', 'location', 'text', 'keyword_was_missing',
       'location_was_missing'],
      dtype='object')

# 5. Data preprocessing: Cleaning the tweets by removing irrelevant elements like hashtags, mentions, and URLs.

Problem: Tweets often contain special characters, hashtags, mentions, and URLs that can affect the model.

Suggestions: Clean up the text column to remove unnecessary elements like URLs, mentions (@username), hashtags, and convert everything to lowercase.

In [11]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    return text

train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# 4. Building vectors

Let's dive deeper into the two parts: vectorizing using TF-IDF and CountVectorizer and modeling with RidgeClassifier.

*# **1. Vectorizing with TF-IDF and CountVectorizer**
*Both TF-IDF (Term Frequency - Inverse Document Frequency) and CountVectorizer are ways to convert text data into a numerical format that can be used by machine learning models.

* ****CountVectorizer:
What it does: It simply counts the number of times each word (or token) appears in each document (in your case, each tweet).

Result: You get a term-document matrix where each row represents a tweet, and each column represents a word from your entire vocabulary.

* ****TF-IDF:
What it does: It combines term frequency (how often a word appears in a tweet) with inverse document frequency (how unique or rare that word is across the entire dataset). Words that are very common across all tweets (e.g., “the,” “and”) will have a low score, while rarer and more significant words will have a higher score.

Result: A weighted term-document matrix where the value represents not just how often the word appears, but also how important the word is in the context of the dataset.

Which one to use?
CountVectorizer: Simple and fast. If your dataset is small and you want a quick representation of word counts, this works well.
TF-IDF: More nuanced. If you want to penalize common words and give more weight to rare or important words, go for TF-IDF. For tweets, TF-IDF tends to perform better **because tweets often have a lot of common words that don't carry much meaning.

*# **3. Regarding Your Training and Testing Strategy:*****
Since you already have separate train_df for training and test_df for validation (testing), there’s no need to split your train_df. You can directly train on train_df and then test on test_df.

Just ensure:

You use the same vectorization process (TF-IDF or CountVectorizer) for both train_df and test_df. 

In building vectors, we will implement **TF-IDF** for text and **CountVectorizer** for keywords.
> *Why? Go to the explication above*

In [12]:
from sklearn.feature_extraction.text import  CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack

#1-CountVectorizer on keyword column

#we initialize the CountVectorizer (create an object) CountVectorizer(max_features=5000)
count_vect=CountVectorizer(max_features=100000) #very important te see the doc about max_features

#fit and transform the training & test data 
train_vect_count=count_vect.fit_transform(train_df['keyword'])
test_vect_count=count_vect.transform(test_df['keyword'])

#2-TfidfVectorizer on text column

#initialization 
tfidf=TfidfVectorizer(max_features=100000)

#fit and transform 
train_vect_tf=tfidf.fit_transform(train_df['text'])
test_vect_tf=tfidf.transform(test_df['text'])

#the train_vect_tf, test_vect_tf, test_vect_count, train_vect_count are in general spare matrixes 
#that's why we imported hstak

#NOW, let's concatenate the two matrices horizontally to create a combined feature set

train_combined = hstack([train_vect_count, train_vect_tf])
test_combined = hstack([test_vect_count, test_vect_tf])

# 5. Modeling

*# 2. Modeling with RidgeClassifier**

*Why RidgeClassifier?
RidgeClassifier is a linear model that applies L2 regularization. It's similar to Logistic Regression but with a squared error loss function (which is commonly used in regression tasks).

Regularization: The L2 regularization helps prevent overfitting, which is useful when you have many features (like you would with text data).

Advantages:

It can be efficient for high-dimensional data, like text, because of its regularization.
Works well with sparse data, which is what you get after using CountVectorizer or TF-IDF.
When to use: It’s a solid choice when you have a large number of features (which happens when you vectorize text) and you want a simple, interpretable model.

Is RidgeClassifier a good idea?

Yes, it can be! RidgeClassifier works well with text data and tends to perform similarly to Logistic Regression with L2 regularization, which is often a top choice for text classification tasks like yours.
If you are looking for a simple, efficient model that can handle a large number of features (as generated by TF-IDF or CountVectorizer), RidgeClassifier is a strong candidate.

Alternatives:

Logistic Regression: Also good with text data and often used for classification. You might want to compare its performance with RidgeClassifier.
Naive Bayes (MultinomialNB): This is another common model for text classification. It can be faster than linear models and works well with both TF-IDF and CountVectorizer.

In [13]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier(class_weight='balanced',solver='lsqr',)
clf.fit(train_combined, Y)

***REMARK:***

if we use the model intuitivelly like this clf.fit(train_combined, Y) , we will get an error "cg() got an unexpected keyword argument 'tol') which is related to an incompatibility between the RidgeClassifier and the version of scipy we are using in the kaggle environment".

So, we must: 

1. upgrade scipy (not possible on this env) 
2. Use a different solver: The default solver for RidgeClassifier when working with sparse matrices is sparse_cg, which is causing this error. You can change the solver to lsqr, which works better with sparse data without triggering this error. clf = RidgeClassifier(solver='lsqr')
3. Convert the sparse matrix to a dense matrix only if the data set is not too large but this is not scalabl solution.

# 6. Testing and Validation

In [14]:
from sklearn.model_selection import cross_val_score
scores  = cross_val_score(clf, train_combined, Y, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", scores)
print("the mean accuracy: ", scores.mean())

Cross-validation accuracy scores: [0.69402495 0.53775443 0.60801051 0.62812089 0.66228647]
the mean accuracy:  0.6260394494233406


# 7. Testing and evaluating different models 

We will test four different models: Logistic Regression, RidgeClassifier, Naive Bayes (MultinomialNB), Random Forests.

In [15]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score #for the evaluation
import numpy as np

#List of the models initialized
models=[
    ('Logistic Regression', LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')),
    ('Ridge Classifier', RidgeClassifier(class_weight='balanced', solver='lsqr')),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Random Forest', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
]

# List to store the cross-validation scores
model_scores = {}

# Train and evaluate each model
for name, model in models:
    print(f"Training {name}...")
    
    # Perform cross-validation on the training data
    scores = cross_val_score(model, train_combined, Y, cv=5, scoring='accuracy')
    
    # Store the scores in the dictionary
    model_scores[name] = scores
    print(f"{name} Cross-validation accuracy scores: {scores}")
    print(f"{name} Mean accuracy: {np.mean(scores)}\n")

#let's select the best model 
best_model_name = max(model_scores, key=lambda name: np.mean(model_scores[name]))
best_model= None
for name, model in models: 
    if name==best_model_name: 
        best_model=model
        break


Training Logistic Regression...
Logistic Regression Cross-validation accuracy scores: [0.69468155 0.55022981 0.62376888 0.64520368 0.68068331]
Logistic Regression Mean accuracy: 0.6389134454354303

Training Ridge Classifier...
Ridge Classifier Cross-validation accuracy scores: [0.69402495 0.53775443 0.60801051 0.62812089 0.66228647]
Ridge Classifier Mean accuracy: 0.6260394494233406

Training Multinomial Naive Bayes...
Multinomial Naive Bayes Cross-validation accuracy scores: [0.68089297 0.54826001 0.63099146 0.65637319 0.69973719]
Multinomial Naive Bayes Mean accuracy: 0.6432509665635033

Training Random Forest...
Random Forest Cross-validation accuracy scores: [0.73342088 0.6086671  0.63230466 0.65637319 0.73390276]
Random Forest Mean accuracy: 0.6729337197574122



> **NOTE: ****From this comparision we notice that the best model is Random Forest with a Mean_accuracy=0.67**

**Graphical comparison**

In [16]:
import matplotlib.pyplot as plt
import numpy as np

# Extracting model names and their mean accuracy scores into x and y
model_names = list(model_scores.keys())
mean_scores = [np.mean(scores) for scores in model_scores.values()]

# Create a bar plot
plt.figure(figsize=(10, 6))
plt.linebar(model_names, mean_scores, color='green')
plt.xlabel('Mean Accuracy')
plt.title('Model Comparison Based on Accuracy')
plt.show()


AttributeError: module 'matplotlib.pyplot' has no attribute 'linebar'

<Figure size 1000x600 with 0 Axes>

In [None]:
# Create a box plot for cross-validation scores
plt.figure(figsize=(10, 6))
plt.boxplot(model_scores.values(), labels=model_scores.keys())
plt.ylabel('Accuracy')
plt.title('Cross-Validation Accuracy Distribution for Each Model')
plt.show()


# 8. Continu the prediction with the best model

In [None]:

#fiting the model 
best_model.fit(train_combined, Y )
#makeing prediction
Y_pred=best_model.predict(test_combined)

#we don't have the Y target we are supposed to find because it's a compitition so the correction result or the score will be knowen at the result 


#Visualization of the predicted labels where we will visualize how many tweets are predicted as real disaster tweets vs. non desaster.
unique, count = np.unique(Y_pred, return_counts=True)

plt.figure(figsize=(10,8))
plt.bar(unique, count, color=["green", "red"])
plt.xticks([0, 1], ['Non-Disaster', 'Disaster'])
plt.ylabel('Number of predictions')
plt.title("Distribution of predicted labels (Non-Disaster vs. Disaster)")
plt.show()

# 9. Submission

In [None]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = Y_pred
sample_submission.to_csv("submission.csv", index=False)