<a href="https://colab.research.google.com/github/MedGhassenBouallegue/Text-Minning/blob/main/DetectionOfComplaintTweetsModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import html

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
# Read the two training sets (complaint / non-complaint tweets) and preview
# the first rows of each.
df_complaint = pd.read_csv('complaint1700.csv')
df_noncomplaint = pd.read_csv('noncomplaint1700.csv')
print(df_complaint.head(), df_noncomplaint.head(), sep='\n')

# Tweets the trained classifier will later be run on.
df_test = pd.read_csv('test_data.csv')





       id    airline                                              tweet
0   80938     United  @united I'm having issues. Yesterday I rebooke...
1   10959     United  @united kinda feel like the $6.99 you charge f...
2  130813  SouthWest  Livid in Vegas, delayed, again&amp; again&amp;...
3  146589     United  @united the most annoying man on earth is on m...
4  117579     United  @united The last 2 weeks I've flown wit u, you...
     id    airline                                              tweet
0   404     United  @brianfadem @united The best summertime soap o...
1   706  SouthWest  @aresef @united yes the change fees are cheape...
2   882  SouthWest  @SouthwestAir Do you guys not fly from Birming...
3  1196     United  This mornings @united #flight seems to be on t...
4  1244    JetBlue  @JetBlue @Boston_Calling I have never been to ...


In [None]:
# Word normalisers shared by the preprocessing helpers: a Porter stemmer and
# a WordNet lemmatizer, each built once and reused for every tweet.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
def clean_text(text):
    """Normalise a raw tweet into lowercase text without digits or punctuation.

    Steps, in order: decode HTML entities, lowercase, delete runs of digits,
    delete every character that is neither a word character nor whitespace,
    then collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw tweet. Any value that is not a ``str`` (e.g. ``None`` or
            a float NaN standing in for a missing cell) is treated as empty.

    Returns:
        The cleaned string, or ``''`` when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # The tweets are HTML-escaped ("&amp;", "&lt;", ...). Decode entities
    # before stripping punctuation; otherwise "&amp;" loses its "&" and ";"
    # and leaves the meaningless token "amp" glued to the text.
    text = html.unescape(text).lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove punctuation. Note that \w also matches "_", so underscores
    # (e.g. inside @handles) are deliberately left in place.
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

In [None]:
def stem_and_lemmatize(text, method='lemmatize'):
    """Drop stop words from `text` and reduce the remaining words to a base form.

    Args:
        text: Already-cleaned text; it is split on whitespace.
        method: ``'lemmatize'`` (default, and the behaviour this function had
            before the parameter existed) runs each word through the
            module-level ``lemmatizer``; ``'stem'`` runs it through the
            module-level ``stemmer`` instead.

    Returns:
        The surviving, normalised words joined by single spaces.

    Raises:
        ValueError: If `method` is neither ``'lemmatize'`` nor ``'stem'``.
    """
    if method == 'lemmatize':
        normalise = lemmatizer.lemmatize
    elif method == 'stem':
        normalise = stemmer.stem
    else:
        raise ValueError(f"method must be 'lemmatize' or 'stem', got {method!r}")
    # `stop_words` is a module-level set looked up when the function is
    # called, so it must be defined before the first call.
    return ' '.join(normalise(word) for word in text.split() if word not in stop_words)

In [None]:
# Preview the raw text column of the non-complaint set before any cleaning.
print(df_noncomplaint.loc[:, 'tweet'])

0       @brianfadem @united The best summertime soap o...
1       @aresef @united yes the change fees are cheape...
2       @SouthwestAir Do you guys not fly from Birming...
3       This mornings @united #flight seems to be on t...
4       @JetBlue @Boston_Calling I have never been to ...
                              ...                        
1695    @SubTheGamer @united completely pathetic! I'm ...
1696    @QuranWeekly @united  Too many bigots in Ameri...
1697    @SangyeH @united I would too. My sister conver...
1698    Ã¢â‚¬Å“@hindukid1021: prime example of our wor...
1699    @united @ImamSuhaibWebb you call that a statem...
Name: tweet, Length: 1700, dtype: object


In [None]:
# English stop-word set. It is used by the helper below and is also read as a
# global by stem_and_lemmatize, so it has to exist before that function runs.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    return ' '.join(word for word in text.split() if word not in stop_words)


def preprocess_tweets(frame):
    """Add `cleaned_tweet` and `processed_tweet` columns to `frame`, in place.

    `cleaned_tweet` is the `tweet` column passed through clean_text and then
    stripped of stop words; `processed_tweet` is `cleaned_tweet` passed
    through stem_and_lemmatize (which filters stop words once more before
    normalising each word).

    Args:
        frame: DataFrame with a `tweet` column of raw text.
    """
    frame['cleaned_tweet'] = frame['tweet'].apply(clean_text).apply(_drop_stop_words)
    frame['processed_tweet'] = frame['cleaned_tweet'].apply(stem_and_lemmatize)


# Run the identical pipeline on both training sets and on the test set so all
# three frames end up with the same derived columns.
for _frame in (df_noncomplaint, df_complaint, df_test):
    preprocess_tweets(_frame)

print(df_test[['tweet', 'processed_tweet']].head())




                                               tweet  \
0  @SouthwestAir get your damn act together. Don'...   
1  @AmericanAir horrible at responding to emails....   
2  @AmericanAir hey where is your crew? Flight aa...   
3         Ok come on we are late let's goooo @united   
4  @AmericanAir since you are now affiliated with...   

                                     processed_tweet  
0  southwestair get damn act together dont announ...  
1  americanair horrible responding email ive sent...  
2  americanair hey crew flight aa im going miss f...  
3                      ok come late let goooo united  
4  americanair since affiliated usairways wanted ...  


Extract Features Using the TF-IDF Model

In [None]:
# Tag each training set with its class: 1 for complaints, 0 for non-complaints.
df_complaint['label'], df_noncomplaint['label'] = 1, 0

# Stack the two sets (non-complaints first) under a fresh 0..n-1 index.
df = pd.concat((df_noncomplaint, df_complaint), ignore_index=True)

print(df.head(2000))





          id    airline                                              tweet  \
0        404     United  @brianfadem @united The best summertime soap o...   
1        706  SouthWest  @aresef @united yes the change fees are cheape...   
2        882  SouthWest  @SouthwestAir Do you guys not fly from Birming...   
3       1196     United  This mornings @united #flight seems to be on t...   
4       1244    JetBlue  @JetBlue @Boston_Calling I have never been to ...   
...      ...        ...                                                ...   
1995   35219     United  @united Yeah, but now I'm waiting for a 3 hour...   
1996   11793     United  @united When is a cancelled flight not cancelled?   
1997  170083   American  Longg morning @dfwairport. Runway closed bec d...   
1998   38205      Delta  @DeltaAssist @Delta I made the orig. reservati...   
1999   58369  SouthWest  .@SouthwestAir Your Wifi is terrible. http://t...   

                                          cleaned_tweet  \
0   

In [None]:
# Training texts and labels, plus the texts to classify, all taken from the
# pre-processed column.
x = df['processed_tweet']
y = df['label']
x_test = df_test['processed_tweet']

# Keep at most the 5000 highest-frequency terms as features.
tfidf = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training tweets only.
x_tfidf = tfidf.fit_transform(x)

# Project the test tweets onto the vocabulary learned above. This must be
# transform(), not fit_transform(): refitting on the test set would build a
# different vocabulary, so column j of the test matrix would no longer mean
# the same term as column j of the matrix the classifier is trained on.
x_test_tfidf = tfidf.transform(x_test)

print(x_tfidf.shape)
print(x_test_tfidf.shape)



(3400, 5000)
(4555, 5000)


In [None]:
# Train a multinomial Naive Bayes classifier on the training TF-IDF matrix.
nb = MultinomialNB().fit(x_tfidf, y)

# Classify the test tweets. x_test_tfidf has to be expressed in the same
# feature space as x_tfidf for these predictions to be meaningful.
y_pred = nb.predict(x_test_tfidf)
df_test['predicted_label'] = y_pred
print(df_test.loc[:, ['tweet', 'predicted_label']].head(10))

# NOTE(review): every test row is assigned label 1, i.e. the evaluation
# assumes test_data.csv contains only complaints -- confirm against the data.
# Under that assumption accuracy is simply the share of rows predicted as 1,
# and class 0 has zero support in the report below.
df_test['label'] = 1
y_test = df_test['label']

# Overall accuracy against the assumed labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred)
print(report)


                                               tweet  predicted_label
0  @SouthwestAir get your damn act together. Don'...                0
1  @AmericanAir horrible at responding to emails....                1
2  @AmericanAir hey where is your crew? Flight aa...                1
3         Ok come on we are late let's goooo @united                0
4  @AmericanAir since you are now affiliated with...                1
5  @IIJERiiCHOII @VirginAmerica what the fuck is ...                0
6  @SouthwestAir your customer service sucks! You...                0
7  Rudest, most condescending customer service re...                0
8  @SouthwestAir flight 195 delayed. So much for ...                1
9              @UtdArif @AmericanAir this is so shit                1
Accuracy: 37.28%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.37      0.54      4555

    accuracy                           0.37      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
