# In [None]:
import itertools
from datetime import datetime

import numpy as np
import pandas as pd

# Load the raw export. The steps below read its 'period_id' and
# 'teleservice_cat' columns.
data = pd.read_csv('data_copy.csv')

# Parse the YYYYMMDD period key once and derive every calendar feature from
# the same parsed series; re-parsing it for each feature repeats the most
# expensive step four times.
period_dates = pd.to_datetime(data['period_id'], format='%Y%m%d')

# Extract year
data['year'] = period_dates.dt.year

# Extract month
data['month'] = period_dates.dt.month_name()  # Month name (e.g., January)

# Extract day of month
data['day_of_month'] = period_dates.dt.day

# Extract day of week (optional)
# day_name() is used instead of strftime('%A') so the weekday names are
# English regardless of the process locale, matching month_name() above.
data['day_of_week'] = period_dates.dt.day_name()  # Day of week (e.g., Monday)

# Other features such as quarter or week of year can be derived from
# period_dates in the same way if needed.

# Order the rows chronologically by the period key.
data = data.sort_values(by='period_id')

# Select rows where 'teleservice_cat' is 'VOICE'
voice_data = data[data['teleservice_cat'] == 'VOICE']


# Select rows where 'teleservice_cat' is either 'VOICE' or 'INTERNATIONAL'
# voice_data = data[data['teleservice_cat'].isin(['VOICE', 'INTERNATIONAL'])]

# Report how many values are missing in each column before any filling.
# isnull() and isna() are aliases in pandas, so both reports list the same
# per-column counts.
missing_by_column = voice_data.isnull().sum(axis=0)
print(f"Nulls Count Before: {missing_by_column}")
nan_by_column = voice_data.isna().sum()
print(f"NaN Count Before: {nan_by_column}")

# Forward-fill: each gap takes the last value seen above it in the same column.
voice_data_not_transformed = voice_data.ffill()
print("End")

# Outlier diagnostics and log transform.
#
# voice_data mixes numeric measures with text columns ('teleservice_cat',
# 'month', 'day_of_week') and with date keys, so subtracting means or taking
# logs of the whole frame fails on the text columns. The arithmetic below is
# therefore restricted to numeric measure columns; every other column is
# carried through unchanged.
# NOTE(review): every numeric column other than the keys listed here is
# treated as a measure -- confirm against the CSV schema.
non_measure_columns = {'period_id', 'year', 'day_of_month'}
measure_columns = [
    column
    for column in voice_data.select_dtypes(include='number').columns
    if column not in non_measure_columns
]
measures = voice_data[measure_columns]
measure_means = measures.mean()

# Calculate z-scores
z_scores = (measures - measure_means) / measures.std()
# Identify outliers based on z-score threshold (e.g., +/- 3)
is_outlier = z_scores.abs() > 3
# Same shape as `measures`: outlier cells keep their value, all others are NaN.
outliers = measures.where(is_outlier)

# Replace outliers with the mean
# axis='columns' aligns the per-column means with the frame's columns; without
# it pandas cannot tell how to broadcast the Series and raises.
handled_outliers = voice_data.copy()
handled_outliers[measure_columns] = measures.where(
    ~is_outlier, measure_means, axis='columns'
)

# Log-transform the measures of the forward-filled frame so the gaps reported
# above do not reach the model. log1p(x) already computes log(1 + x), so no
# extra "+ 1" is applied; the inverse transform is np.expm1.
model_data = voice_data_not_transformed.copy()
model_data[measure_columns] = np.log1p(model_data[measure_columns])

print(f"Nulls Count: {model_data.isnull().sum(axis=0)}")
print(f"NaN Count: {model_data.isna().sum()}")



# Candidate ARIMA orders: the grid search tries every (p, d, q) combination
# drawn from the three lists below. Widen or narrow them to tune the search.
p_values = [1, 2, 3]  # autoregressive order candidates
d_values = [0, 1]  # differencing order candidates
q_values = [0, 1, 2]  # moving-average order candidates

# Running record of the winning configuration. best_score starts at +inf so
# the first order that is scored successfully always replaces it.
best_model = best_order = None
best_score = float('inf')

# Time-series cross-validation splitter (5 splits; adjust n_splits as needed).
tscv = TimeSeriesSplit(n_splits=5)

# Perform hyperparameter tuning and cross-validation with exogenous variables.
#
# For every (p, d, q) order the model is fitted on each training fold and
# scored on the matching validation fold; the order with the lowest mean RMSE
# across folds is recorded in best_score / best_order / best_model.

# ARIMA needs numeric regressors, but 'month' and 'day_of_week' hold names
# (e.g. "January", "Monday"). One-hot encode them once on the full frame so
# every fold sees the same columns; drop_first avoids perfectly collinear
# dummy columns. Numeric columns such as 'day_of_month' pass through as-is.
exog_columns = ['month', 'day_of_month', 'day_of_week']
exog_data = pd.get_dummies(model_data[exog_columns], drop_first=True).astype(float)

# Split the same frame that is indexed below, so the positional indices are
# always valid for it. The folds do not depend on the order being tried, so
# they are computed once, outside the try block, where a failure is a
# programming error rather than a model-fitting error.
folds = list(tscv.split(model_data))

for p, d, q in itertools.product(p_values, d_values, q_values):
    order = (p, d, q)
    try:
        cv_scores = []
        cv_scores2 = []
        cv_scores3 = []

        for train_index, test_index in folds:
            train, validation = model_data.iloc[train_index], model_data.iloc[test_index]

            # Extract exogenous variables
            exog_train = exog_data.iloc[train_index]
            exog_validation = exog_data.iloc[test_index]

            # Include exogenous variables in ARIMA model
            model_with_exog = ARIMA(train['total_duration'], order=order, exog=exog_train)
            model_fit_exog = model_with_exog.fit()

            # Forecast exactly the validation window: one step per validation
            # row, starting right after the last training observation.
            predictions = model_fit_exog.forecast(
                steps=len(validation), exog=exog_validation
            )

            # Calculate evaluation metrics (RMSE is derived from the MSE
            # rather than recomputing it).
            mse = mean_squared_error(validation['total_duration'], predictions)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(validation['total_duration'], predictions)

            print(f"RMSE: {rmse}")
            print(f"MAE: {mae}")
            print(f"MSE: {mse}")

            cv_scores.append(rmse)
            cv_scores2.append(mae)
            cv_scores3.append(mse)

        mean_cv_score = np.mean(cv_scores)

        # Update best model if a better performing model is found.
        # best_model is the (unfitted) model object built for the last fold of
        # the winning order.
        if mean_cv_score < best_score:
            best_score, best_model = mean_cv_score, model_with_exog
            best_order = order

    except Exception as e:
        # Best effort: some orders fail to fit on some folds; report the
        # failure and move on to the next order.
        print(f"Error fitting model: {e}")

# Use the best-performing model for final forecasting with exogenous variables
#final_model = ARIMA(filled_data['total_duration'], order=best_order, exog=filled_data[['month', 'day_of_month', 'day_of_week']])  # Replace with your feature names
#final_model_fit = final_model.fit()
#predictions = final_model_fit.forecast(steps=len(validation))  # Replace validation with your desired forecast horizon
