In [None]:
pip install pandas scikit-learn joblib



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

from google.colab import drive
# Attach Google Drive; the CSV and model paths used in this notebook
# all live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Training Model


In [None]:
# Read the training split, replace missing values with '' and build a single
# `text` feature by joining text_1 and text_2 with a space.
train_data = (
    pd.read_csv("/content/drive/MyDrive/Colab/train.csv")
    .fillna('')
    .assign(text=lambda frame: frame['text_1'] + ' ' + frame['text_2'])
)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab/train.csv'

In [None]:
# Training inputs (combined text) and targets (label column)
X_train, y_train = train_data['text'], train_data['label']

In [None]:
# Feature extraction: TF-IDF over word n-grams (1 to 3 tokens).
# The on/off options are passed as real booleans: scikit-learn validates
# constructor parameter types and recent releases reject integers such as 1
# for boolean options.
vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Fit the vocabulary/IDF weights on the training text only and vectorize it.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [None]:
# Model: Logistic Regression trained on the TF-IDF features
model = LogisticRegression(random_state=42)
model.fit(X_train_tfidf, y_train)

# Persist the classifier to Drive.
joblib.dump(model, '/content/drive/MyDrive/Colab/logistic_regression_model.pkl')

# Persist the fitted vectorizer next to it: the classifier only accepts
# features produced by this exact vectorizer, so a fresh kernel needs both
# files to make predictions.
joblib.dump(vectorizer, '/content/drive/MyDrive/Colab/tfidf_vectorizer.pkl')

# Evaluation on dev and trial data

In [None]:
# Reload the classifier that was saved to Drive
saved_model_path = '/content/drive/MyDrive/Colab/logistic_regression_model.pkl'
model = joblib.load(saved_model_path)

In [None]:
# Read the dev, test and trial splits from Drive
dev_data, test_data, trial_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab/dev.csv',
        '/content/drive/MyDrive/Colab/test.csv',
        '/content/drive/MyDrive/Colab/AV_trial.csv',
    )
)

In [None]:
# Replace every missing value with '' in each evaluation split
dev_data, test_data, trial_data = (
    split_frame.fillna('')
    for split_frame in (dev_data, test_data, trial_data)
)

In [None]:
# Build the single `text` feature (text_1 + ' ' + text_2) on every split,
# mirroring how the training text was assembled.
for split_frame in (dev_data, test_data, trial_data):
    split_frame['text'] = split_frame['text_1'] + ' ' + split_frame['text_2']

In [None]:
# Inputs and targets for the dev and trial splits; for the test split only
# the text is taken.
X_dev, y_dev = dev_data['text'], dev_data['label']
X_trial, y_trial = trial_data['text'], trial_data['label']
X_test = test_data['text']

In [None]:
# Vectorize each split with the already-fitted vectorizer (transform only,
# no re-fitting).
X_dev_tfidf, X_test_tfidf, X_trial_tfidf = (
    vectorizer.transform(split_text)
    for split_text in (X_dev, X_test, X_trial)
)

In [None]:
# Predicted labels for the dev split
dev_predictions = model.predict(X=X_dev_tfidf)

In [None]:
# Predicted labels for the trial split
trial_predictions = model.predict(X=X_trial_tfidf)

In [None]:
# Dev split: overall accuracy followed by the per-class report
dev_accuracy = accuracy_score(y_dev, dev_predictions)
print("Accuracy:", dev_accuracy)
print("Classification Report:")
dev_report = classification_report(y_dev, dev_predictions)
print(dev_report)

Accuracy: 0.5605
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.59      0.57      2989
           1       0.57      0.54      0.55      3011

    accuracy                           0.56      6000
   macro avg       0.56      0.56      0.56      6000
weighted avg       0.56      0.56      0.56      6000



In [None]:
# Trial split: overall accuracy followed by the per-class report
trial_accuracy = accuracy_score(y_trial, trial_predictions)
print("Accuracy:", trial_accuracy)
print("Classification Report:")
trial_report = classification_report(y_trial, trial_predictions)
print(trial_report)

Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        25
           1       0.84      0.84      0.84        25

    accuracy                           0.84        50
   macro avg       0.84      0.84      0.84        50
weighted avg       0.84      0.84      0.84        50



# Model Demo


In [None]:
pip install pandas scikit-learn joblib

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Put your model here.
# NOTE: joblib files are pickle-based; only load files you trust.
model_path = ""
model = joblib.load(model_path)

# Put the path of the fitted TF-IDF vectorizer (a joblib file) here.
# The prediction function below reads a notebook-level `vectorizer`, and the
# classifier only understands features produced by the vectorizer it was
# trained with. Leave empty to keep using a `vectorizer` that is already
# defined in this kernel.
vectorizer_path = ""
if vectorizer_path:
    vectorizer = joblib.load(vectorizer_path)

In [None]:
def model_pre(data_path, output_path='Group_78_A.csv'):
  """Predict labels for a CSV of text pairs and save them to a CSV file.

  Uses the notebook-level ``model`` and ``vectorizer`` objects, which must
  already be defined when this function is called.

  Args:
    data_path: Path of a CSV file containing ``text_1`` and ``text_2``
      columns. A ``label`` column is not required, so unlabelled test data
      can be passed.
    output_path: Destination CSV for the predictions. Defaults to
      ``Group_78_A.csv`` in the current working directory.

  Returns:
    None. The predictions are written to ``output_path`` as a single
    ``prediction`` column, without the index.
  """
  # Load data; missing values become '' so the concatenation below works on
  # strings.
  data = pd.read_csv(data_path).fillna('')

  # Combine both texts into a single feature, joined by a space.
  texts = data['text_1'] + ' ' + data['text_2']

  # Feature extraction with the already-fitted vectorizer, then prediction.
  features = vectorizer.transform(texts)
  predictions = model.predict(features)

  # Save the predictions to a CSV file.
  pd.DataFrame(predictions, columns=['prediction']).to_csv(output_path, index=False)

In [None]:
# Set this to the CSV file you want predictions for, then run the cell
data_path = ""
model_pre(data_path=data_path)