Vectorize

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

human_csv = pd.read_csv('r1_r2_annotations_liwc_h.csv')
human_csv = human_csv[human_csv['majority_vote'] != 'NoMajority']
text = human_csv['proc_text']


max_features = 30
vectorizer = TfidfVectorizer( max_features=max_features)
X = vectorizer.fit_transform(text)

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Assuming you have labels for your data
y = human_csv['majority_vote']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier using the TF-IDF transformed features
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate the classifier
accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.6901408450704225


Dump vectorizer + model

In [75]:
import joblib
joblib.dump(rf_classifier, 'rf_classifier_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']

Model

In [76]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

class ModelPredictor:
    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer
    
    def predict_from_excel(self, excel_file):
        df = pd.read_excel(excel_file, header=0)
        display(df.columns)
        
        predictions = []
        
        for index, row in df.iterrows():
            text = row['text_for_predictions']  # Replace 'text_column' with the actual name of the column containing text data
            
            # Vectorize the text
            X_test = self.vectorizer.transform([text])
            
            prediction = self.model.predict(X_test)
            
            predictions.append(prediction[0]) 
            
        return predictions


In [77]:
# Load the trained model and vectorizer
rf_classifier = joblib.load('rf_classifier_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

model_predictor = ModelPredictor(rf_classifier, vectorizer)

excel_test = 'example_to_predict.xlsx'

predictions = model_predictor.predict_from_excel(excel_test)

print(predictions)

Index(['text_for_predictions'], dtype='object')

['UN', 'UN', 'PO']
