In [2]:
import os

import pandas as pd

# Location of the raw 'twcs' CSV. The default is the original local path; set
# the TWCS_CSV_PATH environment variable to load the file from somewhere else
# without editing the notebook.
TWCS_CSV_PATH = os.environ.get(
    'TWCS_CSV_PATH',
    r'C:\Users\User\projects\ai_support_assistant\data\raw\twcs.csv',
)

# Load 'twcs' from 'raw' file
df = pd.read_csv(TWCS_CSV_PATH)

In [None]:
# Keep only the rows whose 'inbound' flag is False
outbound_df = df.loc[df['inbound'].eq(False)]

In [None]:
# Distinct 'author_id' values among the inbound == False rows
company_names = outbound_df.loc[:, 'author_id'].unique()

In [None]:
import pandas as pd

# Company screen names: the distinct 'author_id' values of the inbound == False rows
outbound_authors = outbound_df['author_id']
company_names = outbound_authors.unique()

In [None]:
import re
from collections import Counter

# Inbound tweets are the rows where 'inbound' is True. Take an explicit copy:
# later cells add columns to this frame, and writing into a bare slice of `df`
# can trigger pandas' SettingWithCopyWarning.
inbound_df = df[df['inbound'] == True].copy()

# Every @-mention in the inbound text, without the '@' prefix (the capture
# group drops it). Rows with missing text are skipped because re.findall
# raises TypeError on a non-string value.
mentions = [
    handle
    for text in inbound_df['text'].dropna()
    for handle in re.findall(r'@(\w+)', text)
]

# Set membership is O(1); `in` against the NumPy array returned by .unique()
# rescans the whole array for every single mention.
known_company_names = set(company_names)

# Count only the mentions that are known company names
mention_counts = Counter(handle for handle in mentions if handle in known_company_names)

# Number of most-mentioned companies kept as targets
TOP_N_COMPANIES = 10
target_companies_list = [name for name, _ in mention_counts.most_common(TOP_N_COMPANIES)]

print("\nTarget Companies List:")
print(target_companies_list)


Target Companies List:
['AmazonHelp', 'AppleSupport', 'AmericanAir', 'Uber_Support', 'Delta', 'VirginTrains', 'SouthwestAir', 'Tesco', 'SpotifyCares', 'British_Airways']


In [None]:
def extract_company_label(text, target_companies):
    """Return the first @-mentioned handle in `text` that is in `target_companies`.

    Args:
        text: Text to scan for @-mentions. Anything that is not a string (for
            example the float NaN pandas uses for a missing value, or None)
            is treated as containing no mentions.
        target_companies: Collection of handles, without the leading '@',
            that count as valid labels.

    Returns:
        The matching handle without its '@' prefix, or None when no mention
        in `text` is in `target_companies`.
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on non-string input; report "no label" instead.
        return None
    # The capture group yields each handle already stripped of its '@'.
    for handle in re.findall(r'@(\w+)', text):
        if handle in target_companies:
            return handle
    return None

def clean_text(text, label):
    """Remove every '@<label>' mention from `text` and strip surrounding whitespace.

    Args:
        text: Original text.
        label: Handle (without '@') to remove. When it is empty or not a
            string (e.g. None), `text` is returned unchanged.

    Returns:
        `text` with the exact '@<label>' mentions removed and leading/trailing
        whitespace stripped, or `text` itself when there is no usable label.
    """
    if not isinstance(label, str) or not label:
        return text
    # re.escape keeps the handle literal, and the negative lookahead stops the
    # match from eating the prefix of a longer handle (e.g. '@Delta' inside
    # '@DeltaAssist').
    mention_pattern = rf'@{re.escape(label)}(?!\w)'
    return re.sub(mention_pattern, '', text).strip()

In [None]:
# Add two columns: the extracted company label, and the text with that label's
# @-mention removed. `assign` returns a new frame, so nothing is written into a
# slice of `df` and the cell can be re-run safely. The second column's lambda
# sees the first one because `assign` evaluates keyword arguments in order.
inbound_df = inbound_df.assign(
    extracted_company_label=lambda frame: frame['text'].apply(
        lambda x: extract_company_label(x, target_companies_list)
    ),
    cleaned_text_for_feature=lambda frame: frame.apply(
        lambda row: clean_text(row['text'], row['extracted_company_label']), axis=1
    ),
)

In [None]:
# Keep only the rows that received a company label. The explicit copy makes
# `filtered_df` an independent frame: later cells assign new columns to it, and
# doing that on a slice of `inbound_df` can trigger SettingWithCopyWarning.
filtered_df = inbound_df[inbound_df['extracted_company_label'].notnull()].copy()

In [None]:
# Download the NLTK resources used for text cleaning
import nltk

nltk.download('stopwords')  # stop-word lists
nltk.download('punkt')      # Punkt tokenizer models used by older NLTK releases
# Newer NLTK releases (3.9 and later, as far as I know; verify against the
# installed version) load the tokenizer data from 'punkt_tab' instead, and
# word_tokenize raises LookupError without it.
nltk.download('punkt_tab')

In [None]:
import string
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    # Imported here so this cell does not depend on an import made elsewhere.
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def clean_text_for_model(text):
    """Lowercase `text`, drop punctuation and English stop words, and re-join the tokens.

    Named `clean_text_for_model` because that is the name the apply cell calls,
    and so that it no longer shadows the two-argument `clean_text(text, label)`
    helper defined earlier in the notebook.

    Requires the NLTK 'stopwords' and tokenizer resources to be downloaded.

    Args:
        text: String to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    from nltk.tokenize import word_tokenize

    # Convert to lowercase, then remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize and drop stop words
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    # Join tokens back into a single string
    return ' '.join(kept_tokens)

In [None]:
# Run clean_text_for_model over the 'cleaned_text_for_feature' column and
# store the result back in that same column
feature_text = filtered_df['cleaned_text_for_feature']
filtered_df['cleaned_text_for_feature'] = feature_text.apply(clean_text_for_model)

In [None]:
# LabelEncoder is not imported anywhere else in the notebook, so import it here.
from sklearn.preprocessing import LabelEncoder

# Encode the 'extracted_company_label' column as integer class ids
label_encoder = LabelEncoder()
filtered_df['encoded_company_label'] = label_encoder.fit_transform(filtered_df['extracted_company_label'])

In [None]:
from sklearn.model_selection import train_test_split

# --- TF-IDF and model training: split the data ---
# Feature: the cleaned text. Target: the encoded company label
# (for now; the plan is to switch the target to a category later).
X = filtered_df['cleaned_text_for_feature']
y = filtered_df['encoded_company_label']

# Hold out 20% of the rows as the test set, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, at most 5000 features, using
# scikit-learn's 'english' stop-word setting
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training split only, then apply the fitted vectorizer to the test split
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a MultinomialNB classifier on the TF-IDF training matrix
# (fit returns the estimator itself, so it can be chained)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predictions for the test split; note the logistic-regression cell reassigns `y_pred`
y_pred = nb_model.predict(X_test_tfidf)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a LogisticRegression classifier (up to 1000 solver iterations) on the
# TF-IDF training matrix; fit returns the estimator itself
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predictions for the test split
y_pred = lr_model.predict(X_test_tfidf)

In [None]:
# Evaluating the models
from sklearn.metrics import accuracy_score, classification_report

# Score each trained model from its own predictions. The shared `y_pred`
# variable only holds the output of whichever training cell ran last, so
# relying on it leaves the other model unevaluated.
trained_models = {
    'MultinomialNB': nb_model,
    'LogisticRegression': lr_model,
}

for model_name, model in trained_models.items():
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n=== {model_name} ===")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# Persist the fitted objects to disk for later use
import joblib

# Preprocessing: TF-IDF vectorizer and label encoder
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=label_encoder, filename='label_encoder.pkl')

# Classifiers: multinomial naive Bayes and logistic regression
joblib.dump(value=nb_model, filename='multinomial_nb_model.pkl')
joblib.dump(value=lr_model, filename='logistic_regression_model.pkl')