In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from joblib import dump, load

# Make sure the NLTK stopword corpus is available, then keep the English
# list as a set so membership tests during preprocessing are fast.
nltk.download('stopwords')
STOPWORDS = {*stopwords.words('english')}


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/quhaowen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Text preprocessing
def preprocess_text(text, stopwords=None):
    """Normalise a raw claim/evidence value into a space-separated token string.

    Steps: coerce to ``str``, delete every character that is not an ASCII
    letter or whitespace, lowercase, split on whitespace, drop stopwords.

    Args:
        text: Value to clean. Non-string values go through ``str()`` first,
            so a missing value such as NaN is treated as the text ``"nan"``.
        stopwords: Collection of lowercase words to discard. ``None`` (the
            default) selects the module-level ``STOPWORDS`` set.

    Returns:
        The surviving lowercase words joined by single spaces, or ``""`` when
        nothing survives.
    """
    if stopwords is None:
        # Looked up at call time, so the function can be defined (or the cell
        # re-run) independently of when STOPWORDS was built.
        stopwords = STOPWORDS

    # Ensure the text is of string type
    text = str(text)

    # Remove non-alphabetical characters.
    # The fourth positional argument of re.sub is `count`, not `flags`.
    # Passing re.I|re.A (== 258) there silently capped the number of removed
    # characters at 258 per text and never applied the flags. The argument is
    # dropped: the character class already covers both cases, and matching
    # stays exactly as it behaved before, minus the cap.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    words = text.split()

    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords]
    return ' '.join(filtered_words)


In [4]:
# Load the training and development splits.
train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')

# For each split: clean the claim and evidence columns, then join the two
# cleaned strings (space-separated) into a single 'Text' field.
for frame in (train_df, dev_df):
    for source_col, cleaned_col in (
        ('Claim', 'Processed_Claim'),
        ('Evidence', 'Processed_Evidence'),
    ):
        frame[cleaned_col] = frame[source_col].apply(preprocess_text)
    frame['Text'] = frame['Processed_Claim'] + " " + frame['Processed_Evidence']


In [9]:
# TF-IDF over 1- to 3-grams, capped at 5000 features; terms found in more
# than 75% of the documents or in fewer than 5 documents are ignored.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.75,
    max_features=5000,
)

# Labels for both splits.
y_train = train_df['label'].values
y_dev = dev_df['label'].values

# The vocabulary and IDF weights are fitted on the training text only; the
# dev text is projected with that fitted vectorizer. Both matrices are
# converted to dense arrays.
X_train = tfidf.fit_transform(train_df['Text']).toarray()
X_dev = tfidf.transform(dev_df['Text']).toarray()


In [10]:
# Hold out 20% of the training rows as an internal test split.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the remaining 80%, with 'balanced'
# class weighting.
class_weights = 'balanced'
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weights,
    random_state=42,
).fit(X_train_split, y_train_split)

# Persist the fitted model so later cells can reload it from disk.
dump(rf_classifier, 'rf_classifier.joblib')

# Report precision/recall/F1 on the held-out 20% of train.csv
# (this is the internal split above, not dev.csv).
predictions = rf_classifier.predict(X_test_split)
print(classification_report(y_test_split, predictions))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      3484
           1       0.65      0.56      0.61      1257

    accuracy                           0.80      4741
   macro avg       0.75      0.73      0.74      4741
weighted avg       0.80      0.80      0.80      4741



In [11]:
# Restore the classifier that was saved to disk after training.
rf_classifier = load('rf_classifier.joblib')

# Predict a label for every dev row, attach it to the frame, and export only
# the 'prediction' column (header row, no index) to CSV.
dev_predictions = rf_classifier.predict(X_dev)
dev_df['prediction'] = dev_predictions
dev_df.to_csv('dev_predictions.csv', columns=['prediction'], index=False)

In [12]:
# Restore the classifier that was saved to disk after training.
rf_classifier = load('rf_classifier.joblib')

# Read the test split.
test_df = pd.read_csv('test.csv')

# Apply the same cleaning used for train/dev, then join claim and evidence
# into one 'Text' field.
for source_col, cleaned_col in (
    ('Claim', 'Processed_Claim'),
    ('Evidence', 'Processed_Evidence'),
):
    test_df[cleaned_col] = test_df[source_col].apply(preprocess_text)
test_df['Text'] = test_df['Processed_Claim'] + " " + test_df['Processed_Evidence']

# Project the text with the already-fitted TF-IDF vectorizer (transform only,
# no refit) and convert to a dense array.
X_test = tfidf.transform(test_df['Text']).toarray()

# Predict, attach the labels to the frame, and export only the 'prediction'
# column (header row, no index) to CSV.
test_predictions = rf_classifier.predict(X_test)
test_df['prediction'] = test_predictions
test_df.to_csv('Group_33_A.csv', columns=['prediction'], index=False)