In [7]:
# Step 1: Import Necessary Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the Datasets
print("Loading datasets...")
train_df = pd.read_csv('train.tsv', sep='\t')
valid_df = pd.read_csv('valid.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

print("Datasets loaded successfully.\n")

# Step 3: Data Preprocessing
print("Preview of training dataset:")
print(train_df.head())
print("\nTraining dataset info:")
print(train_df.info())

print("\nPreview of validation dataset:")
print(valid_df.head())
print("\nValidation dataset info:")
print(valid_df.info())

print("\nPreview of testing dataset:")
print(test_df.head())
print("\nTesting dataset info:")
print(test_df.info())

# Step 4: Check Dataset Columns before Concatenation
print("\nColumns in training dataset:")
print(train_df.columns)

print("\nColumns in validation dataset:")
print(valid_df.columns)

# Step 5: Combine train and validation sets for training
print("\nCombining training and validation datasets...")
df = pd.concat([train_df, valid_df], ignore_index=True)
print("Combined dataset info:")
print(df.info())

# Step 6: Verify 'statement' and 'label' Column Existence in Combined Dataset
print("\nColumns in combined dataset:")
print(df.columns)

if 'statement' not in df.columns:
    raise KeyError("The 'statement' column is not present in the combined dataset.")
if 'label' not in df.columns:
    raise KeyError("The 'label' column is not present in the combined dataset.")

# Step 7: Data Cleaning (if needed)
# Drop rows with missing values
df = df.dropna()

# Step 8: Vectorization with TfidfVectorizer
print("\nVectorizing text data...")
X_train = df['statement']
y_train = df['label']
X_test = test_df['statement']
y_test = test_df['label']

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
print("Text data vectorized.\n")

# Step 9: Training the PassiveAggressiveClassifier
print("Training the PassiveAggressiveClassifier...")
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)
print("Model trained successfully.\n")

# Step 10: Model Evaluation
print("Evaluating the model...")
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {score * 100:.2f}%')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Step 11: Save the Model and Vectorizer
print("\nSaving the model and vectorizer...")
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pac, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
print("Model and vectorizer saved.\n")

# Step 12: Visualizations
# Visualize Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Visualize Distribution of Labels
plt.figure(figsize=(6, 4))
df['label'].value_counts().plot(kind='bar', color=['skyblue', 'salmon', 'green', 'yellow', 'purple', 'blue'])
plt.xlabel('Labels')
plt.ylabel('Count')
plt.title('Distribution of Labels')
plt.xticks(rotation=45)
plt.show()


Loading datasets...
Datasets loaded successfully.

Preview of training dataset:
    2635.json        false  \
0  10540.json    half-true   
1    324.json  mostly-true   
2   1123.json        false   
3   9028.json    half-true   
4  12465.json         true   

  Says the Annies List political group supports third-trimester abortions on demand.  \
0  When did the decline of coal start? It started...                                   
1  Hillary Clinton agrees with John McCain "by vo...                                   
2  Health care reform legislation is likely to ma...                                   
3  The economic turnaround started at the end of ...                                   
4  The Chicago Bears have had more starting quart...                                   

                             abortion    dwayne-bohac  \
0  energy,history,job-accomplishments  scott-surovell   
1                      foreign-policy    barack-obama   
2                         health-care  

KeyError: "The 'statement' column is not present in the combined dataset."