In [1]:
# https://github.com/johanna23cct/integrated-CA2-MSc-2023094.git

In [2]:
pip install skforecast --user

Note: you may need to restart the kernel to use updated packages.


In [14]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd
import datetime

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
#from skforecast.ForecasterAutoregMultiOutput import ForecasterAutoregMultiOutput
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

from joblib import dump, load

# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [15]:
#dataset
data = pd.read_csv('ProjectTweets.csv')

In [18]:
# display the total number of rows data
total_rows = data.shape[0]
formatted_total_rows = "{:,}".format(total_rows)
print("Total Rows:", formatted_total_rows)

Total Rows: 1,599,999


In [5]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,5,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
# Put title to the dataset
# *************************************************************
data = [
    [0, 1467810369, "Mon Apr 06 22:19:45 PDT 2009", "NO_QUERY", "_TheSpecialOne_", "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"],
    # Add more rows here...
]

# rename (Titles)
columns = ["n", "ids", "date", "flag", "user", "text"]

# Create a DataFrame with the data and column names
df = pd.DataFrame(data, columns=columns)

In [7]:
df.head(10)

Unnamed: 0,n,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."


In [8]:
print(df.date)

0    Mon Apr 06 22:19:45 PDT 2009
Name: date, dtype: object


In [9]:
# Date string 
date_string = df.date   #"Mon Apr 06 22:19:45 PDT 2009"
print(date_string)

0    Mon Apr 06 22:19:45 PDT 2009
Name: date, dtype: object


In [10]:
date_string = "Mon Apr 06 22:19:45 PDT 2009"
date = date_string.split()[1],date_string.split()[2],date_string.split()[5]
date

('Apr', '06', '2009')

In [11]:
date_string = "Mon Apr 06 22:19:45 PDT 2009"
hour = date_string.split()[3]
hour

'22:19:45'

In [12]:
# loop through each date string  and  split it

date_string = df['date']

for date_str in date_string:
    date_components = date_str.split()
    
    if len(date_components) >= 6:
        date = (date_components[1], date_components[2], date_components[5])
        hour = date_components[3]
        print("Date", date)
        print("Hour:", hour)
    else:
        print("Invalid date string format:", date_str)



Date ('Apr', '06', '2009')
Hour: 22:19:45


In [19]:
print(f'Number of rows with missing values: {data.isnull().any(axis=1).mean()}')

Number of rows with missing values: 0.0


In [20]:
# Verify that a temporary index is complete
# ==============================================================================
(data.index == pd.date_range(start=data.index.min(),
                             end=data.index.max(),
                             freq=data.index.freq)).all()

AttributeError: 'RangeIndex' object has no attribute 'freq'

In [None]:
# Data preparation
# ==============================================================================

df = df.set_index('date')
df = df.resample('M').asfreq()
df = df.sort_index()
data.head()