In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pickle

In [3]:
# Read the labelled examples into a pandas dataframe
data = pd.read_csv('data/old-data.csv')

# Build one text document per row by joining the query and the snippet with a
# space. Missing values are replaced by empty strings first, because
# ' '.join raises TypeError on NaN (a float).
text = (
    data[["user_query", "snippet"]]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)
y = data["source"]

# Split the RAW text before vectorizing. Fitting the vectorizer on the full
# corpus would let the held-out rows influence the vocabulary (data leakage)
# and make the evaluation below optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    text, y, test_size=0.4, random_state=42
)

# Learn the vocabulary from the training documents only, then apply the same
# fitted vocabulary to the held-out documents.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Keep `X` bound to the bag-of-words matrix of the full dataset, as before,
# now encoded with the training-only vocabulary.
X = vectorizer.transform(text)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the held-out data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model and the vectorizer.
# NOTE: np.save stores arbitrary Python objects by pickling them inside an
# object array, so reading these files back needs
# np.load(path, allow_pickle=True).item(). Only load them from trusted sources.
np.save('model.npy', model)
np.save('vectorizer.npy', vectorizer)



              precision    recall  f1-score   support

          NR       0.81      0.98      0.89       132
           R       0.83      0.25      0.38        40

    accuracy                           0.81       172
   macro avg       0.82      0.62      0.64       172
weighted avg       0.82      0.81      0.77       172



In [30]:
# Persist the fitted vectorizer and classifier together as a single pickled tuple
bundle = (vectorizer, model)
with open('model.pkl', mode='wb') as sink:
    pickle.dump(bundle, sink)

# Read the tuple back from disk and rebind both names to the restored objects
with open('model.pkl', mode='rb') as source:
    restored = pickle.load(source)
vectorizer, model = restored
    
# Define a function to predict the relevance of a source given a user query and snippet
def predict_relevance(model, vectorizer, query: str, snippet: str):
    """Predict the relevance label for a single (query, snippet) pair.

    Args:
        model: Fitted classifier exposing ``predict(features)``.
        vectorizer: Fitted text vectorizer exposing ``transform(documents)``.
        query: The user query text.
        snippet: The snippet text of the candidate source.

    Returns:
        The first element of ``model.predict(...)``, i.e. the class label the
        model assigns to this pair. This is whatever label type the model was
        trained on (the recorded run prints the string ``R``), not a 1/0 flag.

    Raises:
        TypeError: If ``query`` or ``snippet`` is not a string.
    """
    # Join query and snippet with a single space, the same way the training
    # documents are assembled, so the features match what the model saw.
    document = ' '.join((query, snippet))

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([document])

    # predict() returns one label per input row; there is exactly one row.
    return model.predict(features)[0]

# Sanity-check the model on two hand-picked examples.
# predict_relevance returns the model's class label. The classification report
# for this model lists the labels 'R' and 'NR', so the legend printed next to
# each prediction names those labels (presumably R = relevant, NR = not
# relevant) instead of a 1/0 flag.
LABEL_LEGEND = "(R=Relevant, NR=Not relevant)"

# Test the model with a relevant input
query1 = "What year did the Capulin Volcano in New Mexico Last errupt"
snippet1 = "Capulin Volcano National Monument is located in northeastern New Mexico near the town of Raton. Designated a national monument in 1916, it contains a portion of the Raton-Clayton Volcanic Field, in particular a prime example of an extinct cinder cone volcano known as Capulin."
prediction1 = predict_relevance(model, vectorizer, query1, snippet1)
print(f"Prediction for input 1: {prediction1} {LABEL_LEGEND}")

# Test the model with a non-relevant input (the recorded output shows the
# model predicting R for this example as well).
# NOTE(review): the strings below contain mis-encoded quote characters; they
# are left untouched because they are model input — confirm whether the
# training data carries the same encoding.
query2 = "Roosevelt Campobello Parkâ€™s cameo in Ken Burns documentary ..."
snippet2 = "The park has another reason to celebrate: the debut of a new 14-hour PBS documentary by acclaimed filmmaker Ken Burns, â€œThe Roosevelts: An Intimate History,â€ a 14-hour, seven-part film set to ..."
prediction2 = predict_relevance(model, vectorizer, query2, snippet2)
print(f"Prediction for input 2: {prediction2} {LABEL_LEGEND}")

Prediction for input 1: R (1=Relevant, 0=Not relevant)
Prediction for input 2: R (1=Relevant, 0=Not relevant)
