In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# Load the raw training data and assemble the modelling frame `df`.
training_data = pd.read_csv('train_file.csv')

# Clean the 'Source' column and assign the result back explicitly.
# Calling `.replace(..., inplace=True)` on `training_data['Source']` operates
# on a temporary Series; with pandas Copy-on-Write that update never reaches
# `training_data`. Raw strings keep `\s` from being parsed as an (invalid)
# Python string escape.
source_clean = (
    training_data['Source']
    .replace(r'^\s+$', np.nan, regex=True)          # whitespace-only value -> missing
    .replace(r'[-_]', '', regex=True)               # strip hyphens and underscores
    .replace(r'[^\x00-\x7f]', np.nan, regex=True)   # value containing non-ASCII -> missing
)

# dropna() without `subset` removes rows with a missing value in ANY column,
# not only in 'Source'.
training_data = training_data.assign(Source=source_clean).dropna()
assert training_data['Source'].notna().all(), "'Source' still contains missing values"

source = list(training_data['Source'])
# Remove every non-word character (whitespace and punctuation included).
source_without_spaces = [re.sub(r'\W', '', x) for x in source]

# Plain lists are used so that `df` gets a fresh 0..n-1 index instead of the
# gapped index left behind by dropna().
data = {
    'headline': training_data['Headline'].tolist(),
    'source': source_without_spaces,
    'topic': training_data['Topic'].tolist(),
    'facebook': training_data['Facebook'].tolist(),
    'linkedin': training_data['LinkedIn'].tolist(),
    # NOTE(review): this key is named 'instagram' but is filled from the
    # 'GooglePlus' column. The name is kept because later cells select the
    # column as 'instagram' - consider renaming both together.
    'instagram': training_data['GooglePlus'].tolist(),
    'target': training_data['SentimentHeadline'].tolist(),
}

df = pd.DataFrame(data)

In [3]:
# The target is the 'target' column; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Numerical columns: standardise with StandardScaler.
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Headline text: TF-IDF vectorisation with the default settings.
headline_transformer = Pipeline([('tfidf', TfidfVectorizer())])

In [5]:
# Route each group of columns to its transformer: numerical, categorical and
# the headline text.
numeric_columns = ['facebook', 'linkedin', 'instagram']
categorical_columns = ['source', 'topic']
# The text column is given as a plain string (not a list), exactly as before.
headline_column = 'headline'

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('headline', headline_transformer, headline_column),
    ]
)

In [6]:
# Stand-alone random forest: 100 trees, seeded for repeatable results.
base_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [7]:
# Stacked ensemble: a linear regression and a small random forest as base
# estimators, with a larger random forest as the final estimator.
stack_estimators = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
]
stack_final_estimator = RandomForestRegressor(n_estimators=100, random_state=42)

stacking_model = StackingRegressor(
    estimators=stack_estimators,
    final_estimator=stack_final_estimator,
)

In [8]:
# Chain the column preprocessing and the stacked regressor into one estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('stacked_model', stacking_model),
]
full_pipeline = Pipeline(pipeline_steps)

# Train on the training split; as the last expression, the fitted pipeline is
# also what the cell displays.
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['facebook', 'linkedin',
                                                   'instagram']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['source', 'topic']),
                                                 ('headline',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer())]),
                                                  'headline')

In [9]:
# `full_pipeline` was already assembled from `preprocessor` and
# `stacking_model` and fitted on (X_train, y_train) in the previous cell.
# Rebuilding and refitting the same seeded pipeline on the same data only
# repeats the training cost, so this cell just displays the fitted pipeline.
full_pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['facebook', 'linkedin',
                                                   'instagram']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['source', 'topic']),
                                                 ('headline',
                                                  Pipeline(steps=[('tfidf',
                                                                   TfidfVectorizer())]),
                                                  'headline')

In [None]:
# Run the fitted pipeline (preprocessing + stacked model) on the held-out features.
y_pred = full_pipeline.predict(X=X_test)

In [None]:
# Score the held-out predictions with mean absolute error.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")