In [None]:
#Libraries
import pandas as pd
import sqlite3
import pickle
from kafka import KafkaProducer, KafkaConsumer
import json
import random
from tqdm import tqdm
import time

import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Kafka connection settings
# Broker address list passed to the Kafka clients created below
bootstrap_servers = ['localhost:9092']
# Name of the Kafka topic used by this pipeline
topic = 'predictions_topic'

# Factory for the Kafka producer
def create_kafka_producer():
    """Build a KafkaProducer configured with the module-level ``bootstrap_servers``.

    Returns:
        A KafkaProducer whose values are serialized as UTF-8 encoded JSON.
    """
    def to_json_bytes(value):
        # Dump the payload to a JSON string, then encode it as UTF-8 bytes
        return json.dumps(value).encode('utf-8')

    producer_options = {
        'bootstrap_servers': bootstrap_servers,
        'value_serializer': to_json_bytes,
        'api_version': (0, 10, 1),
    }
    return KafkaProducer(**producer_options)
    
# Function to create Kafka consumer
def create_kafka_consumer():
    """Build a KafkaConsumer subscribed to the configured topic.

    The topic name and broker list come from the module-level ``topic`` and
    ``bootstrap_servers`` settings, so changing those settings is enough to
    point the consumer somewhere else.

    Returns:
        A KafkaConsumer in consumer group 'prediction_group' whose message
        values are decoded from UTF-8 JSON.
    """
    return KafkaConsumer(
        topic,  # Use the shared setting instead of repeating the topic name
        bootstrap_servers=bootstrap_servers,
        group_id='prediction_group',  # Consumer group ID
        value_deserializer=lambda m: json.loads(m.decode('utf-8')),  # Deserialize data from JSON
        api_version=(0, 10, 1),
        # Per the kafka-python docs: where to start reading when the group has
        # no committed offset yet
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        max_poll_records=500
    )

In [None]:
def load_models():
    """Unpickle the two trained models from the working directory.

    Reads 'happiness_model_linear.pkl' first and 'happiness_model_tree.pkl'
    second.

    Returns:
        Tuple ``(linear_regression_model, decision_tree_model)``.
    """
    model_paths = ('happiness_model_linear.pkl', 'happiness_model_tree.pkl')

    unpickled = []
    for model_path in model_paths:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files
        with open(model_path, 'rb') as model_file:
            unpickled.append(pickle.load(model_file))

    linear_regression_model, decision_tree_model = unpickled
    return linear_regression_model, decision_tree_model

In [None]:
# Function to (re)create the SQLite database table
def create_sqlite_db(db_path='predictions.db'):
    """Create a fresh, empty ``predictions`` table in the SQLite database.

    Any existing ``predictions`` table is dropped first, so every call starts
    from an empty table.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            'predictions.db'; sqlite3 creates the file if it does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        # "with conn" commits on success and rolls back if a statement fails
        with conn:
            conn.execute('DROP TABLE IF EXISTS predictions;')
            conn.execute('''
                CREATE TABLE predictions (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    GDP_per_capita REAL,
                    Life_Expectancy REAL,
                    Freedom REAL,
                    happiness_score REAL,
                    linear_prediction REAL,
                    tree_prediction REAL
                )
            ''')
    finally:
        # Always release the connection, even when a statement raised
        conn.close()
    print("Database and table created.")

In [None]:
def send_data_to_kafka(data, producer):
    """Publish one Kafka message per row of ``data`` and wait for delivery.

    Each message has the form ``{'features': <JSON string>}``, where the JSON
    string holds every column of the row except 'Happiness Score'.

    Args:
        data: DataFrame with a 'Happiness Score' column plus the feature columns.
        producer: Kafka producer exposing ``send(topic, value=...)`` and
            ``flush()``.
    """
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Sending data to Kafka"):
        # Extract the features from the row (without the 'Happiness Score' column)
        features = row.drop('Happiness Score').to_dict()

        # The features are deliberately stored as a JSON string inside the
        # message: the consuming side decodes data['features'] with json.loads.
        features_json = json.dumps(features)

        # Only the features are sent, without predictions
        message = {'features': features_json}

        # Queue the message on the configured topic
        producer.send(topic, value=message)

        print(f"Sent to Kafka: {message}")

    # send() only buffers the record; flush once after the loop so every
    # message has actually been handed to the broker before returning.
    producer.flush()

In [None]:
def insert_prediction_to_db(features, hs, linear_prediction, tree_prediction, db_path='predictions.db'):
    """Insert one row of features, true score and predictions into ``predictions``.

    Args:
        features: Dict of feature values. The keys 'GDP per Capita',
            'Life Expectancy' and 'Freedom' are read; a missing key is stored
            as NULL.
        hs: True happiness score for the row.
        linear_prediction: Prediction of the linear regression model.
        tree_prediction: Prediction of the decision tree model.
        db_path: Path of the SQLite database file. Defaults to 'predictions.db'.

    Raises:
        sqlite3.Error: If the insert fails (for example, the table is missing).
    """
    # Extract the feature values; dict.get yields None for absent keys
    gdp_per_capita = features.get('GDP per Capita')
    life_expectancy = features.get('Life Expectancy')
    freedom = features.get('Freedom')

    conn = sqlite3.connect(db_path)
    try:
        # "with conn" commits on success and rolls back if the insert fails
        with conn:
            conn.execute('''
                INSERT INTO predictions (GDP_per_capita, Life_Expectancy, Freedom, happiness_score, linear_prediction, tree_prediction)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (gdp_per_capita, life_expectancy, freedom, hs, linear_prediction, tree_prediction))
    finally:
        # Always release the connection, even when the insert raised
        conn.close()

    # Print the inserted data for verification
    print(f"Data inserted into database: GDP_per_capita={gdp_per_capita}, Life_Expectancy={life_expectancy}, Freedom={freedom}, Happiness Score={hs}, Linear Prediction={linear_prediction}, Tree Prediction={tree_prediction}")

In [None]:
def consume_and_store_predictions(consumer, real, linear_regression_model, decision_tree_model):
    """Consume feature messages, predict with both models and store the results.

    The i-th consumed message is paired with the i-th value of ``real``.
    Consumption stops once every value in ``real`` has been paired, so the
    function returns instead of blocking on the consumer forever, and it never
    indexes past the end of ``real``.

    NOTE(review): the pairing relies on the topic delivering exactly the rows
    that were just produced, in the same order. Stale messages from an earlier
    run would misalign scores and features; confirm this for your setup.

    Args:
        consumer: Iterable of messages whose ``.value`` is a dict with a
            'features' entry holding a JSON-encoded feature dict.
        real: Sequence of true happiness scores exposing ``tolist()``.
        linear_regression_model: Model exposing ``predict``.
        decision_tree_model: Model exposing ``predict``.
    """
    real_values = real.tolist()
    expected_count = len(real_values)

    # Nothing to pair with: do not wait on the consumer at all
    if expected_count == 0:
        return

    processed = 0
    with tqdm(total=expected_count, desc="Processing messages") as progress:
        for message in consumer:
            data = message.value
            print(f"Data: {data}")

            # True Happiness Score (hs) paired with this message by position
            hs = real_values[processed]
            print(f"HS: {hs}")

            # Extract the features from the received message
            features = json.loads(data['features'])

            # Build the single-row input once and reuse it for both models.
            # Assumes the JSON key order matches the column order the models
            # were trained with - TODO confirm.
            feature_row = [list(features.values())]
            linear_prediction = linear_regression_model.predict(feature_row)[0]
            tree_prediction = decision_tree_model.predict(feature_row)[0]

            # Insert the results into the database
            insert_prediction_to_db(features, hs, linear_prediction, tree_prediction)

            print(f"Data consumed and stored: {data}")

            processed += 1
            progress.update(1)

            # Stop after the last expected message; otherwise the consumer
            # iterator would keep waiting for new messages indefinitely.
            if processed >= expected_count:
                break

In [None]:
def main():
    """Run the pipeline end to end.

    Steps: recreate the predictions table, load both models, publish every row
    of 'data_test.csv' to Kafka, then consume the messages, predict and store
    the results. The Kafka producer and consumer are always closed, even when
    a step fails.
    """
    # Create the database and table
    create_sqlite_db()

    # Load the models
    linear_regression_model, decision_tree_model = load_models()

    # Load the dataset before opening any Kafka connection, so a missing file
    # fails without leaving a client open
    data_test = pd.read_csv('data_test.csv')

    # Publish the feature rows to Kafka
    producer = create_kafka_producer()
    try:
        send_data_to_kafka(data_test, producer)
        # Make sure every buffered message is delivered before consuming
        producer.flush()
    finally:
        producer.close()

    # Consume the data, make the predictions and store them in the database
    consumer = create_kafka_consumer()
    try:
        consume_and_store_predictions(consumer, data_test['Happiness Score'], linear_regression_model, decision_tree_model)
    finally:
        consumer.close()

if __name__ == '__main__':
    main()

In [None]:
# Connect to SQLite database
conn = sqlite3.connect('predictions.db')
# Load the true scores and both models' predictions from the database
query = "SELECT happiness_score, linear_prediction, tree_prediction FROM predictions"
df = pd.read_sql(query, conn)

# True values (happiness_score) and predicted values (linear_prediction, tree_prediction)
y_true = df['happiness_score']
y_pred_linear = df['linear_prediction']
y_pred_tree = df['tree_prediction']


def _regression_metrics(actual, predicted):
    """Return ``(mae, mse, rmse, r2)`` for one set of predictions."""
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    # RMSE is derived from MSE rather than passing ``squared=False``: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    # square root works on both old and new versions.
    rmse = mse ** 0.5
    r2 = r2_score(actual, predicted)
    return mae, mse, rmse, r2


def _print_metrics(model_label, mae, mse, rmse, r2):
    """Print the evaluation metrics of one model."""
    print(f"\n{model_label} Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2): {r2}")


def _plot_real_vs_predicted(ax, actual, predicted, model_label, color, mae, mse, r2):
    """Draw a real-vs-predicted scatter plot with the identity line on ``ax``."""
    ax.scatter(actual, predicted, color=color, alpha=0.5)
    # Dashed red diagonal: where a perfect prediction would fall
    bounds = [actual.min(), actual.max()]
    ax.plot(bounds, bounds, color='red', linestyle='--')
    ax.set_title(f'{model_label}: Real vs Predicted')
    ax.set_xlabel('Real Happiness Score')
    ax.set_ylabel('Predicted Happiness Score')
    ax.text(
        0.05, 0.85,
        f'MSE: {mse:.2f}\nMAE: {mae:.2f}\nR2: {r2:.2f}',
        transform=ax.transAxes,
        fontsize=10,
        bbox=dict(facecolor='white', alpha=0.7)
    )


# Calculate evaluation metrics for both models
mae_linear, mse_linear, rmse_linear, r2_linear = _regression_metrics(y_true, y_pred_linear)
mae_tree, mse_tree, rmse_tree, r2_tree = _regression_metrics(y_true, y_pred_tree)

# Display evaluation metrics
_print_metrics("Linear Regression", mae_linear, mse_linear, rmse_linear, r2_linear)
_print_metrics("Decision Tree", mae_tree, mse_tree, rmse_tree, r2_tree)

# Plot the actual vs predicted values, one panel per model
fig, (ax_linear, ax_tree) = plt.subplots(1, 2, figsize=(14, 6))

_plot_real_vs_predicted(
    ax_linear, y_true, y_pred_linear, 'Linear Regression', 'blue',
    mae_linear, mse_linear, r2_linear,
)
_plot_real_vs_predicted(
    ax_tree, y_true, y_pred_tree, 'Decision Tree', 'green',
    mae_tree, mse_tree, r2_tree,
)

fig.tight_layout()
plt.show()