# Importing Libraries

In [6]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import calendar
import dash
from dash import dcc, html, dcc, callback_context
import dash_core_components as dcc
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output
import dash_html_components as html
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.tsa.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from wordcloud import WordCloud
from collections import Counter
from gensim.summarization import summarize

ModuleNotFoundError: No module named 'gensim.summarization'

# 

# Importing Dataset
#### This dataset will be used for complete Data Visualisation task and  for the Question 1 (Time Series) from Machine Learning for Business.

In [None]:
df = pd.read_csv('weather.csv')
df.head(5)

#### For better understanding of the dataset, I'm providing terminology explanation, to be aware what each weather parameter represents:

datetime: Date and time of the weather data.

tempmax: Maximum temperature recorded.

tempmin: Minimum temperature recorded.

temp: Current temperature.

feelslikemax: Maximum "feels like" temperature.

feelslikemin: Minimum "feels like" temperature.

feelslike: Current "feels like" temperature.

dew: Dew point temperature.

humidity: Relative humidity percentage.

precip: Precipitation amount.

precipprob: Probability of precipitation.

precipcover: Extent of precipitation coverage.

preciptype: Type of precipitation (e.g., rain, snow).

snow: Snowfall amount.

snowdepth: Snow depth on the ground.

windgust: Maximum wind gust.

windspeed: Current wind speed.

winddir: Wind direction.

sealevelpressure: Atmospheric pressure at sea level.

cloudcover: Percentage of sky covered by clouds.

visibility: Visibility range in meters.

solarradiation: Solar radiation amount.

solarenergy: Solar energy captured or available.

uvindex: UV index.

severerisk: Risk of severe weather conditions.

sunrise: Time of sunrise.

sunset: Time of sunset.

moonphase: Current phase of the moon.

conditions: Summary of weather conditions.

description: Detailed description of weather conditions.

icon: Icon representing weather conditions.

stations: Information about weather monitoring stations or locations.

In [None]:
print(df.info())

# 

# Data Preparation

In [None]:
columns_with_nan = df.columns[df.isnull().any()].tolist()

nan_count_per_column = df[columns_with_nan].isnull().sum()

print("Columns with NaN values and their respective counts:")
print(nan_count_per_column)

In [None]:
columns_with_nan = df.columns[df.isnull().any()].tolist()

for column in columns_with_nan:
    unique_values = df[column].unique()
    print(f"Column '{column}':")
    print(unique_values)
    print("---------------------------------")

# 

I looked for the unique values within columns that contain NaN values, to see if missing value is just matter of mistake or it indicates that value for that column was not available in the moment of recording, since it's weather data. 

1. precitype (Precipitation Type) It has 1003 NaN values. Missing values here suggest that the type of precipitation (rain, snow etc.) is not specified or availabe at the moment of recording. It might happen due to not having rain or snow at that moment at all. However, this column doesn't bring much value for my dataset and will drop it. 


2. snowdepth (Snow Depth): There is only 1 NaN value in this column. Snow depth typically represents the depth of accumulated snow at a specific location or time. One missing value indicates that snow depth data was not added as we have 0 within unique values. I will replace NaN value by using Interpolation.


3. windgust (Wind Gust): This column contains 254 missing values. Wind gust refers to the maximum wind speed observed over a brief period. The absence of values in this column for 254 observations signifies that wind gust data is not available for these instances, so I will replace thos NaN values by using Interpolation. 


4. sealevelpressure (Sea Level Pressure): There is 1 missing value in this column. Sea level pressure represents atmospheric pressure adjusted to sea level. One missing value implies that sea level pressure data was simply not added that day. I will replace NaN value using Interpolation. 


5. severerisk (Risk of Severe Weather Conditions): This column has 1374 missing values. Severerisk represents the risk level associated with severe weather conditions. The high count of missing values (1374) indicates that the risk level for severe weather was not applicable for these recordings, therefore NaN values as there was no risk level. In this situation, I will replace NaN values with 0 by using Imputation. 


6. stations: There are 6 missing values in the 'stations' column. This column might contain identifiers or codes for weather stations. The missing values suggest that the station identifiers are not added for these observations. Since this column doesn't bring any value to my dataset, as there are no longitude and altitude for plotting, I will just remove it from dataset. 

# 

### Removing NaN Values

#### Droppping columns

In [None]:
columns_to_drop = ['precip', 'precipprob', 'precipcover', 'preciptype', 'stations']
df.drop(columns=columns_to_drop, inplace=True)

#### Interpolation

In [None]:
df['sealevelpressure'].interpolate(method='linear', inplace=True)
df['windgust'].interpolate(method='linear', inplace=True)
df['snowdepth'].interpolate(method='linear', inplace=True)

#### Imputation

In [None]:
df['severerisk'].fillna(0, inplace=True)

#### Re-checking dataset

In [None]:
nan_values = df.isnull().sum()
nan_values

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['sunrise'] = pd.to_datetime(df['sunrise'])
df['sunset'] = pd.to_datetime(df['sunset'])
df.info()

# 

# EDA

#### Data Distribution

In [None]:
columns_for_histogram = ['tempmax', 'tempmin', 'temp', 'humidity', 'windgust', 'uvindex']

plt.figure(figsize=(12, 8))
for i, col in enumerate(columns_for_histogram, 1):
    plt.subplot(2, 3, i)
    plt.hist(df[col], bins=20, alpha=0.7, color='blue', edgecolor='black')
    plt.title(col + " Distribution")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.grid(axis='y')

plt.tight_layout()
plt.show()

In [None]:
columns_for_histogram = ['tempmax', 'tempmin', 'temp', 'humidity', 'windgust', 'uvindex']
fig = px.histogram(df, x=columns_for_histogram, marginal="rug", title="Distribution of Weather Parameters")
fig.update_layout(bargap=0.3)  
fig.show()

As we can see from the distribution plots, most common 'tempmax' is around 30, which indicates that frequently occuring maximum temperature recorded is around 30 degrees Celsius. Most common minimum temperature is around 22C and most common temperature is around 25.

Most common humidity is in range between 70-75%, most common wind gusts are 15 to 20 km/h and most common UV index values are 8 and 9, which represent quite hing exposure. 

Based on these information, weather conditions in this area could be described as moderately warm, with moderate humidity and wind gusts. This would imply comfortable climate with temperatures typically not too hot or cold, along with moderate humidity levels and occasional breezy conditions. We can see on minimum temperature that it barely goes under 0C. Also, due to high UV index, it would be advisable to take precautions when UV index is in most common range (8-9).


About the plot:

I created a histogram to visualise the distributions of numerical variables in my dataset. For this plot, I used probably the most relevant ones: 'tempmax', 'tempmin', 'temp', 'humidity' and 'windgust'. 

I used 'blue' color for better visibility and aesthetic appeal in visualisations. I also added 'edgecolor=black' to define histogram bars edges for better distinction and readability. Also, as per feedback from CA1, I added Grid Lines for better readability of values and visual interpreting the distribution of data. In the plot under, I created a interactive version static plot, for dashboard. 

#### Correlation Analysis

In [None]:
continuous_columns = ['temp', 'humidity', 'windgust', 'uvindex', 'severerisk']
pairplot_data = df[continuous_columns]

fig = px.scatter_matrix(pairplot_data, dimensions=continuous_columns, title='Correlation between Continuous Variables')

fig.update_traces(diagonal_visible=False)  
fig.update_layout(width=1000, height=800)  
fig.show()

Correlations between columns:

1. Logic correclation comes between UV index and temperature - higher UV index is noted on higher temperatures, but also, most of the high UV index values are noted between 20 and 30C. Higher severe risk was noted only on higher temperatures. There is no specific correlation really between temperature and wind gust or humidity, except that wind gusts and humidity mostly show up when temperature is above 0C.

2. Humidity doesn't have much impact on UV index, but high UV index appears mostly when humidity is less than 80%. High severity risks are appearing mostly on higher humidity levels. 

3. Wind gusts have no much correlation with other factors. High severe risks usually show up on wind gusts between 25-50 km/h.

#### Categorical Variables Analysis

In [None]:
icon_counts = df['icon'].value_counts().reset_index()
icon_counts.columns = ['Icon', 'Count']  # Rename the columns for Plotly compatibility

fig = px.bar(
    icon_counts,
    x='Icon',
    y='Count',
    labels={'Icon': 'Icon', 'Count': 'Count'},
    title='Distribution of Weather Icons'
)

fig.update_layout(
    xaxis={'tickangle': 45},
    yaxis={'title': 'Count'},
    height=500,
    width=800
)

fig.show()

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])

fig = px.scatter(df, x='datetime', y='icon', labels={'icon': 'Icon', 'datetime': 'Date'},
                 title='Type of Weather for Each Day', hover_data={'datetime': '|%Y-%m-%d'})

fig.update_layout(
    xaxis={'title': 'Date'},
    yaxis={'title': 'Icon'},
    height=400,  # Adjusted height for dashboard fitting
    width=700,   # Adjusted width for dashboard fitting
    template='plotly_white',  # Lighter theme for better dashboard integration
    margin=dict(l=40, r=40, t=40, b=40),  # Adjusted margins for better space utilization
    font=dict(family="Arial", size=12),  # Font adjustments for better readability
)

fig.show()

Categorical variables were 'conditions', 'description' and 'icon'. I decided to plot 'icon' as that column represents how the weather was that day and it represents target variable as well. Description is just explaining weather conditions that day and it contains too much unique text to be plotted. 

As we can see, even though the weather was explained as moderate warm and humid, we can see that most common weather tag was rain.

Again, I picked this plot to be interactive and added another interactive plot (to plot 'icon' against date, to get the type of weather for each day) for dashboard creating and added Grid Lines for better visibility and readbility. 

#### Relationship between Continuous Variables

In [None]:
continuous_columns = ['temp', 'humidity', 'windspeed', 'sealevelpressure', 'uvindex', 'severerisk', 'dew', 'visibility']
corr_matrix = df[continuous_columns].corr()

zmax = max(corr_matrix.values.max(), -corr_matrix.values.min())

heatmap = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu',  
    zmin=-zmax,
    zmax=zmax,
    colorbar=dict(title='Correlation'),
    hoverongaps=False
)

layout = go.Layout(
    title='Correlation Heatmap of Continuous Variables',
    xaxis=dict(title='Variables'),
    yaxis=dict(title='Variables'),
)

fig = go.Figure(data=[heatmap], layout=layout)

fig.show()

As we can see, columns with higher positive correlation are temperature, dew, visibility and UV Index in combination with each other. 


I used coolwarm as color map and I was using cool colors (blue shades) for negative correlations and warm colors (red shades) for positive correlations is intuitive for most people, as it represents contrast in familiar way. Also, the stronger correlation is (or opposite), the color is darker. 

#### Temperature Variation

In [None]:
temperature_columns = ['tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin', 'feelslike']

box_traces = []
for column in temperature_columns:
    box_trace = go.Box(
        y=df[column],
        name=column,
        boxmean='sd'  
    )
    box_traces.append(box_trace)

layout = go.Layout(
    title='Temperature Variation',
    yaxis=dict(title='Temperature (°C)'),
    xaxis=dict(title='Temperature Columns'),
    showlegend=True
)

fig = go.Figure(data=box_traces, layout=layout)

fig.show()

This plot is showing us how temperatures vary. From what we can see, it looks that "Feels like" temperatures are quite aligned with actual temperatures, especially with max and min temperature ('feelslike' has higher span than 'temp'). 

There are no outliers. 


I picked this plot as it enables to compare temperature-related variables in a single plot. This helps in understanding the range and distribution of different temperature-related measurements. It also provides key statistics which helps assesing central tendency and skewness of the data. 

#### Time-related Analysis

In [None]:
# Convert 'datetime' column to datetime type
df['datetime'] = pd.to_datetime(df['datetime'])

# Create dropdown options for selecting variables
dropdown_options = [{'label': col, 'value': col} for col in df.columns if col in [
    'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'snow', 'snowdepth',
    'windgust', 'windspeed', 'sealevelpressure', 'cloudcover', 'visibility',
    'solarradiation', 'solarenergy', 'uvindex', 'severerisk']
]

# Create a figure with 'temp' data as default
fig = go.Figure(go.Scatter(x=df['datetime'], y=df['temp'], mode='lines'))

# Update layout to include dropdown menu
fig.update_layout(
    title='Weather Trends over Time',    
    updatemenus=[
        {
            'buttons': [
                {
                    'label': var,
                    'method': 'update',
                    'args': [{'y': [df[var]]}],
                } for var in df.columns if var in [
                    'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'snow', 'snowdepth',
                    'windgust', 'windspeed', 'sealevelpressure', 'cloudcover', 'visibility',
                    'solarradiation', 'solarenergy', 'uvindex', 'severerisk']
            ],
            'direction': 'down',
            'showactive': True,
            'x': 0.01,
            'xanchor': 'left',
            'y': 1.1,
            'yanchor': 'top'
        }
    ]
)

# Display the initial figure with 'temp' data
fig.show()

I picked interactive plot with zoom funcionality due to large number of inputs, which wouldn't be readable in static plot. Users can zoom in and out for specific areas of interest, for closer examination of patterns or anomalies. 
Also, it provides ability to focus on smaller time intervals without losing the context of the entire dataset. I decided to add dropdown option, to give option to pick between most important weather variables.
Added Grid and blue color for better readability.

#### Mean data per year compared to overall mean

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])

df['year'] = df['datetime'].dt.year

columns_of_interest = ['temp', 'humidity', 'snow', 'windspeed', 'uvindex']

overall_mean = df[columns_of_interest].mean()

mean_by_year = df.groupby('year')[columns_of_interest].mean()

In [None]:
for col in columns_of_interest:
    plt.figure(figsize=(8, 6))

    plt.axhline(y=overall_mean[col], color='red', linestyle='-', label='Overall Mean')

    bars = plt.bar(mean_by_year.index, mean_by_year[col], alpha=0.7, label=f'{col} Mean')
    plt.title(f'{col} Mean Values per Year compared to Overall Mean')
    plt.xlabel('Year')
    plt.ylabel('Mean Value')
    plt.legend(loc='lower left')
    plt.xticks(rotation=45)
    plt.grid(axis='y')

    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, yval, round(yval, 2), va='bottom')

    plt.tight_layout()
    plt.show()

As we can see from these dashboards, temperature was the only one that was showing some sort of jump compared to overall mean. Mean temperature for 2018 was 2C higher than mean and almost 4C higher than mean of 2021 and 2022. 

Other than that, we can see there was no much snow in these years, only in 2022, which was one of the colder years.


I used this plot to compare means values per year to the overall mean. It provides clear visual representation of how each zear's meand value for different variables compares to respective overall mean. It allows an easy comparison between the mean values and overall one. It also helps to identify if certain years were significantly different from others or from overall mean. 

I included Drid Lines and numeric values on top of the bars for better clarity and readability.

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])

df['month'] = df['datetime'].dt.month

mean_temp = df.groupby(['year', 'month'])['temp'].mean().reset_index()

pivot_mean_temp = mean_temp.pivot(index='month', columns='year', values='temp')
pivot_mean_temp = pivot_mean_temp.rename(index=lambda x: calendar.month_abbr[x])

fig = go.Figure(data=go.Heatmap(
    z=pivot_mean_temp.values.tolist(),
    x=pivot_mean_temp.columns,
    y=pivot_mean_temp.index,
    colorscale='YlOrRd', 
    zmin=pivot_mean_temp.min().min(), 
    zmax=pivot_mean_temp.max().max(), 
    colorbar=dict(title='Mean Temperature'),
    zhoverformat='.1f'
))

fig.update_layout(
    title='Mean Temperature per Month for Each Year',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Month')
)

fig.show()

I created this plot to get more in details temperature data for plot above this one. 

I used this type of plot as it effectively displays a matrix of data using variations in color to represent values. It's excellent for showcasing patterns in matrix-like datasets. By using color gradients, the heatmap makes it easy to interpret and compare mean temperature values. This color scheme helps to quickly grasp the variations in temperature across months and years, it's visible on the first sight which days were warmer than others.

# 

# Creating Interactive Dashboard

### Layout

In [None]:
# Ovaj je dobar
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

app.layout = html.Div([
    html.Div([
        html.Div([
            html.H1("Overview", style={'font-size': '26px','text-align': 'center'}),
            dcc.Markdown('''
            The dataset, sourced from [Kaggle](https://www.kaggle.com/datasets/rukenmissonnier/weather-in-istanbul/data), 
            presents an extensive collection of weather parameters specific to Istanbul, 
            collected between April 7, 2018, and September 27, 2023. 
            With 2000 entries and 32 columns, it covers various weather attributes such as temperature, humidity, precipitation, 
            wind characteristics, and more. Link to local address for better visiblity: http://127.0.0.1:8050/
            '''),

            html.H3("Insights from Exploratory Data Analysis (EDA)", style={'font-size': '20px', 'text-align': 'center'}),
            dcc.Markdown('''
            ###### Temperature Analysis
            - **tempmax & tempmin**: Most commonly observed maximum temperature is approximately 30°C, while the typical minimum temperature hovers around 22°C. The average temperature observed falls close to 25°C.
            - **temp**: The frequently recorded temperature tends to be around 25°C.
            ###### Humidity and Wind Analysis
            - **humidity**: The prevalent humidity levels are within the range of 70-75%.
            - **windgust & windspeed**: Commonly recorded wind gusts range between 15 to 20 km/h.
            ###### UV Index and Weather Conditions
            - **uvindex**: The predominant UV index values are 8 and 9, indicating substantial exposure to UV radiation.
            - **conditions & description**: The prevailing weather conditions suggest a moderately warm climate, with moderate humidity and occasional breezy conditions. The temperatures typically maintain a comfortable range without extreme hot or cold spells. Notably, the minimum temperature rarely drops below 0°C.
            ###### Precautionary Measures
            Given the fact we have a high count of the higher UV index values (8-10), it is advisable to take precautions against sun exposure during periods when the UV index is at its most common levels.
            ''')
        ], className='section'),
    ], className='col-6'),

        html.Div([
            html.Div([
                html.H1("Distribution of Weather Parameters", style={'font-size': '24px', 'text-align': 'center'}),
                dcc.Dropdown(
                    id='weather-params-dropdown',
                    options=[
                        {'label': 'Temperature', 'value': 'temp'},
                        {'label': 'Humidity', 'value': 'humidity'},
                        {'label': 'Wind Gust', 'value': 'windgust'},
                        {'label': 'UV Index', 'value': 'uvindex'},
                        {'label': 'Max Temperature', 'value': 'tempmax'},
                        {'label': 'Min Temperature', 'value': 'tempmin'}
                ],
                value='temp'  # Default value when the app starts
            ),
            dcc.Graph(id='weather-params-graph')
        ], className='section')
    ], className='col-6'),

    html.Div([
        html.Div([
            html.H1("Mean Temperature per Month for Each Year", style={'font-size': '22px', 'text-align': 'center'}),
            dcc.Dropdown(
                id='year-dropdown',
                options=[{'label': str(year), 'value': year} for year in df['year'].unique()],
                value=df['year'].min(),  # Default value when the app starts
                clearable=False,  # Disable option to clear the selection
                style={'width': '50%'}  # Adjust width of the dropdown
            ),
            dcc.Dropdown(
                id='month-dropdown',
                options=[{'label': calendar.month_abbr[month], 'value': month} for month in range(1, 13)],
                value=8,  # Default value when the app starts (January)
                clearable=False,  # Disable option to clear the selection
                style={'width': '50%'}  # Adjust width of the dropdown
            ),
            dcc.Graph(id='mean-temp-graph')
        ], className='section')
    ], className='col-6'),

    html.Div([
        html.Div([
            html.H1("Correlation Heatmap of Continuous Variables", style={'font-size': '22px', 'text-align': 'center'}),
            dcc.Dropdown(
                id='corr-vars-dropdown-1',
                options=[{'label': col, 'value': col} for col in df.columns],
                value=['temp', 'humidity', 'snow', 'windspeed'],  # Default values when the app starts
                multi=True  # Allow multiple selections
            ),
            dcc.Graph(id='corr-heatmap-2')
        ], className='section'),
    ], className='col-6'),

    html.Div([
        html.Div([
            html.H1("Weather Trends over Time", style={'font-size': '24px', 'text-align': 'center'}),
            dcc.Dropdown(
                id='weather-params-dropdown2',
                options=[
                    {'label': col, 'value': col} for col in [
                        'tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'snow', 'snowdepth',
                        'windgust', 'windspeed', 'sealevelpressure', 'cloudcover', 'visibility',
                        'solarradiation', 'solarenergy', 'uvindex', 'severerisk']
                ],
                value='temp'  # Default value when the app starts
            ),
            dcc.Graph(id='weather-trends-graph', figure=fig)  # Your plot ID and initial figure
        ], className='section'),
    ], className='col-6'),

    html.Div([
        html.Div([
            html.H1("Type of Weather for Each Day", style={'font-size': '24px', 'text-align': 'center'}),
            dcc.Graph(id='weather-type-graph')  # Your plot ID
        ], className='section'),
    ], className='col-6')
], className='row')

# Callbacks

# Callback for weather parameters graph
@app.callback(
    Output('weather-params-graph', 'figure'),
    [Input('weather-params-dropdown', 'value')]
)
def update_weather_params_graph(selected_param):
    if selected_param is None:
        default_param = 'temp'  # Set the default parameter to 'temp'
        columns_for_histogram = [default_param]
    else:
        columns_for_histogram = [selected_param]

    fig = px.histogram(df, x=columns_for_histogram, marginal="rug", title=f"Distribution of {columns_for_histogram[0].capitalize()}")
    fig.update_layout(
        bargap=0.3,
        width=800,
        template='plotly_white',
        height=600,
        margin=dict(l=40, r=40, t=40, b=40),
    )
    return fig
    

# Callback for mean temperature
@app.callback(
    Output('mean-temp-graph', 'figure'),  # Assuming 'mean-temp-graph' is the ID of your heatmap plot
    [Input('year-dropdown', 'value'),
     Input('month-dropdown', 'value')]
)
def update_mean_temp_graph(selected_year, selected_month):
    # Logic to update the heatmap plot based on selected year and month
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['month'] = df['datetime'].dt.month

    mean_temp = df.groupby(['year', 'month'])['temp'].mean().reset_index()
    pivot_mean_temp = mean_temp.pivot(index='month', columns='year', values='temp')
    pivot_mean_temp = pivot_mean_temp.rename(index=lambda x: calendar.month_abbr[x])

    selected_temp = pivot_mean_temp[selected_year]
    selected_month_str = calendar.month_abbr[selected_month]

    fig = go.Figure(go.Heatmap(
        z=pivot_mean_temp.values.tolist(),
        x=pivot_mean_temp.columns,
        y=pivot_mean_temp.index,
        colorscale='YlOrRd',
        zmin=pivot_mean_temp.min().min(),
        zmax=pivot_mean_temp.max().max(),
        colorbar=dict(title='Mean Temperature'),
        zhoverformat='.1f'
    ))
    fig.update_layout(
        title=f'Mean Temperature for {selected_month_str} {selected_year}',
        template='plotly_white',
        xaxis=dict(title='Year'),
        yaxis=dict(title='Month'),
        annotations=[
            dict(
                x=selected_year,
                y=pivot_mean_temp.index.get_loc(selected_month_str),  # Getting index location of selected month
                xref="x",
                yref="y",
                text=f"Mean Temp: {selected_temp.loc[selected_month_str]:.1f}",
                showarrow=True,
                arrowhead=7,
                ax=0,
                ay=-40
            )
        ]
    )
    return fig

# Callback for corr-heatmap-2
@app.callback(
    Output('corr-heatmap-2', 'figure'),  # Assuming 'corr-heatmap-2' is the ID of your heatmap
    [Input('corr-vars-dropdown-1', 'value')]
)
def update_corr_heatmap(selected_variables):
    corr_matrix = df[selected_variables].corr()

    zmax = max(corr_matrix.values.max(), -corr_matrix.values.min())

    heatmap = go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='RdBu',
        zmin=-zmax,
        zmax=zmax,
        colorbar=dict(title='Correlation'),
        hoverongaps=False
    )

    layout = go.Layout(
        template='plotly_white',
        xaxis=dict(title='Variables'),
        yaxis=dict(title='Variables'),
    )

    fig = go.Figure(data=[heatmap], layout=layout)

    return fig

# Callback for weather-trends-graph
# Callback for updating the weather trends graph
@app.callback(
    Output('weather-trends-graph', 'figure'),
    [Input('weather-params-dropdown2', 'value')]
)
def update_weather_trends_graph(selected_param):
    # Logic to update the figure based on the selected parameter

    # Assuming df is your DataFrame and 'datetime' is the datetime column
    updated_fig = go.Figure(go.Scatter(x=df['datetime'], y=df[selected_param], mode='lines'))

    # Further customization of the updated_fig based on selected_param...
    updated_fig.update_layout(
        title=f"Weather Trends for {selected_param}",
        template='plotly_white',
        xaxis=dict(title='Date'),
        yaxis=dict(title=selected_param)
    )

    return updated_fig

# Callback for updating the "Type of Weather for Each Day" plot
@app.callback(
    Output('weather-type-graph', 'figure'),
    [Input('weather-params-dropdown2', 'value')]
)
def update_weather_type_graph(selected_param):
    # Assuming df is your DataFrame and 'datetime' is the datetime column
    # Replace 'icon' with your actual column name for weather type
    df['datetime'] = pd.to_datetime(df['datetime'])
    fig = px.scatter(df, x='datetime', y='icon', labels={'icon': 'Icon', 'datetime': 'Date'})
    
    fig.update_layout(
        xaxis={'title': 'Date'},
        yaxis={'title': 'Icon'},
        height=400,
        width=700,
        template='plotly_white',
        margin=dict(l=40, r=40, t=40, b=40),
        font=dict(family="Arial", size=12),
    )

    return fig

if __name__ == '__main__':
    app.run_server(debug=True, dev_tools_ui=True)

As per assignment, we were supposed to create a interactive dashboard which will include 3 rows and 2 columns, making total of 6 sections. When it comes to visualisation compontents, in my workbook, I used Heatmap, Line Chart, Histograms, Bar Chart, Stacked Bar Chart, Boxplot and Scatter plots to visualise my dataset. Also, I used Interactive features like dropdowns and interactive plots.

My dashboard followed those instructions, it consist of 6 sections (3 rows and 2 columns). First section is overview of the dataset (text section), while other 5 are interactive plots (4 of them with dropdowns, one without.)

1. "Overview" is a text section, it's giving all the relevant on the dataset I picked, as well as, link to it's source. Also, sums up all the conclusions I made by performing the EDA. 


2. "Distribution of Weather Parameters" is histogram, created to give insights on distribution of the weather parameters. I used this type of the plot as it's ideal to display the frequency of weather variables like temperature, humidity etc and it helps to recognise patterns and which values are most frequent. It also helps to understand ranges of weather conditions. I also included dropdown with variables, so users can pick variables that interest them.


3. "Mean Temperature per Month for Each Year" is a heatmap type of plot and it displays mean temperature variations across on month level, for 5 years. I picked this type of plot as it's perfect for illustrating how mean temperature varies over time, helping to identify seasonal patterns. It also shows variations effectively, allowing viewer to compare temperatures across different time period. I used colorscale='YlOrRd' as it makes visible to bare eye which periods were warmer and which ones were cooloer by the darker and lighter colors. This plot is also interactive, with dropdown allowing audience to pick a year and month of the interest.


4. "Correlation Heatmap of Continuous Variables" is a Heatmap (Correlation Matrix) type of plot. I picked this type of plot as it shows relationship between continuous variables in simple and understandable way. It's ideal for understanding the strengt of correlations between weather parameters like temperature, humidity, wind speed etc. Also, it helps to identify which variables are strongly correlated and by that, impacting each other. By having this plot, audience will be able, for example, to see strong correlation between temperature and UVIndex. I picked this colorscale as it clearly visible that variables with darker colors are in stronger correlation than ones in lighter colors. Stronger correlation is, the color is darker and vice-versa. Also, I made this plot interactive and added dropdown as well, with list of most important variables so audience can pick variables of their interest. 


5. "Weather Trends over Time" is time-series plot, Line Chart type. Purpose of this plot is to show trends or patterns over a continuous time period. It's really useful for visualising how weather parameters (eg. temperature, humidity etc) change or trend over time. It also helps audience to identify patterns, seasonality or long-term trends in the weather data. This plot is interactive as well, so audience can zoom in specific period of interest, but also they can pick from dropdown parameter of their interest, to see it's continous sequence and to percieve trends and changes more intuitively. 


6. "Type of Weather for Each Day" is categorical time series plot. I picked this plot to give audience more insights on what kind of weather was recorded each day, in case they want to compare other weather parameters with actual weather for that specific day. I used this type of plot as it's great for presenting categorical data like weather conditions (e.g. sunny, rainy, cloudy etc.) observed daily. It also helps to understand the distribution and frequency of each weather condition over the time. This plot was also interactive, due to high number of inputs, to make it readable. 


These plots together offer a comprehensive view of the weather dataset, covering distribution, trends over time, relationships between variables and correlations among continuous variables, ensuring a holistic understanding for audience. The selection of plots still effectively covers various asprects of the weather dataset, providing audience with multiple perspectives to explore and understand the data's nuances.


Choice of colors: I used blue color for bars, dots and lines as it's percieved as calm color, it's easy on eyes and can create sense of comfort for viewers, making it pleasant to look at for extended periods. In cases I had more options to pick at same time, I had to bring in other colors as well, for better visibility, but blue was main color in dashboard plots. Also, I was adding Grid Lines, as per instructions from previous assignment again, for better visibility, as well as using white background. 

# 

# Machine Learning for Business

## Time series analysis


Times series analysis involves analyzing data points collected or recorded at specific time intervals. It is used for forecasting, understanding underlying patterns and making predictions based on the time-dependent data. In machine learning, various models like ARMA, ARIMA, SARIMA etc. are applied to time series for prediction. 


Time series used on my specific dataset: 

Temporal Patterns and Trends:
Time series helps in uncovering patterns, trends, and seasonality within the data. It allows us to understand how weather attributes like temperature, humidity, wind characteristics, etc. change over time. 

Forecasting and Predictions:
Time series models are used to forecast future values based on historical patterns. For instance, predicting future temperature trends or the likelihood of certain weather conditions occurring based on past data.

Detecting Anomalies or Outliers:
Analyzing time series data helps in identifying unusual or anomalous events. Sudden spikes in temperature, unexpected weather patterns, or irregularities can be detected through time series analysis.

### 

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
plt.figure(figsize=(12, 6))
df["windspeed"].plot()

### ADF Test

##### Purpose of Augmented Dickey-Fuller Test:
The ADF test is used to determine wheather a given time series is stationary or not. 
Stationarity is a crucial assumption in time series analysis, and the ADF test helps in confirming or rejecting the presence of a unit root in the data. A unit root suggests non-stationarity. 

Stationarity refers to a time series where statistical properties, like mean, variance, and autocorrelation, remain constant over time. In simpler terms, it doesn't exhibit long-term trends, and its statistical properties don't change with time.

Stationarity is vital in time series analysis because many forecasting models assume it. It simplifies analysis, making predictions more reliable. Non-stationary data might give misleading insights or forecasts, making modeling more challenging.


I will perform ADF test on 'windspeed' column.

In [None]:
result = adfuller(df['windspeed'], autolag='AIC')

print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:')
for key, value in result[4].items():
    print(f'   {key}: {value}')

#### Results Analysis: 

ADF Statistic: The ADF Statistic value is -16.03. This value is significantly lower (more negative) than the critical values at 1%, 5%, and 10% levels, indicating strong evidence against the null hypothesis. This suggests that the 'windspeed' time series is stationary.

p-value: The p-value of approximately 6.06e-29 is well below the typical significance levels (e.g., 0.05), providing further evidence against the null hypothesis of non-stationarity. The low p-value indicates a high level of confidence in rejecting the null hypothesis in favor of stationarity.

Critical Values: All critical values at 1%, 5%, and 10% levels are more negative than the ADF Statistic, confirming the statistical significance of the test results and supporting the stationarity of the 'windspeed' time series.

With the ADF test indicating that the 'windspeed' time series is stationary, it suggests that there might not be a need for differencing or other transformations to make the data stationary before proceeding with time series modeling or analysis. This stationary characteristic simplifies the modeling process and makes it more suitable for various time series modeling techniques without requiring additional adjustments for stationarity.

### EDA 

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(df['datetime'], df['windspeed'], color='blue')
plt.title('Windspeed Time Series for 2022')
plt.xlabel('Date')
plt.ylabel('Windspeed')
plt.grid(True)
plt.show()

### Identification of Parameters

In [None]:
plt.figure(figsize=(12, 6))

plt.subplot(211)
plot_acf(df['windspeed'], lags=40, ax=plt.gca(), color='blue')
plt.title('Autocorrelation Function (ACF)')

plt.subplot(212)
plot_pacf(df['windspeed'], lags=40, ax=plt.gca(), color='blue')
plt.title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

### Model Selection


I decided to go with ARIMA based on observed characteristics of the windspeed time series data, particulary from ACF and PACF plot, along with some previous assumptions:

1. ACF and PACF Patterns: The significant spikes observed in the PACF plot at the first two lags and the ACF plot indicating a slow decay in autocorrelation suggest a potential autoregressive (AR) behavior at these lags. This pattern aligns with the characteristics that ARIMA models aim to capture.

2. Stationarity: As per the earlier analysis using the Augmented Dickey-Fuller (ADF) test, the 'windspeed' series was found to be stationary. ARIMA models are applicable to stationary time series data, and stationary series tend to exhibit consistent behavior over time, making them suitable for ARIMA modeling.

3. Time Dependency: ARIMA models are designed to capture time dependencies in data, especially those with observed autocorrelation patterns (as seen in ACF and PACF plots).

4. Forecasting: Since assignment requires one-step-ahead forecasting, ARIMA is used in short forcastings. 

### Finding the Best Parameters


As my dataset is already stationary, I might not need to include differencing in the model. 

In [None]:
aic_vals = []
for p in range(5):
    for q in range(5):
        model = sm.ARIMA(df['windspeed'], order=(p, 0, q))
        try:
            fitted_model = model.fit()
            aic = fitted_model.aic
            aic_vals.append([aic, p, q])
        except:
            continue

aic_vals.sort()

print(aic_vals)

### Splitting the Dataset

In [None]:
train_size = int(len(df) * 0.8)

In [None]:
train_data, validation_data = df.iloc[:train_size], df.iloc[train_size:]

print("Training set size:", len(train_data))
print("Validation set size:", len(validation_data))

In [None]:
adfuller(train_data["windspeed"])

In [None]:
plt.figure(figsize= [12, 5])
plt.plot(train_data["windspeed"])

In [None]:
plot_acf(train_data["windspeed"])

In [None]:
plot_pacf(train_data["windspeed"])

### Fitting ARIMA model

Parameters: p, d, q = 2, 0, 0

In [None]:
p, d, q = 4, 0, 4  
arima_model = sm.ARIMA(train_data['windspeed'], order=(p, d, q))
fitted_model = arima_model.fit()
fitted_model.summary()

### Further Investigation

In [None]:
fitted_model.plot_diagnostics(figsize=(15, 12))

Parameters: p, d, q = 1, 0, 4

In [None]:
p, d, q = 1, 0, 4  
arima_model = sm.ARIMA(train_data['windspeed'], order=(p, d, q))
fitted_model2 = arima_model.fit()
fitted_model2.summary()

### Further Investigation

In [None]:
fitted_model2.plot_diagnostics(figsize=(15, 12))

### Forecast

In [None]:
actual_values = df['windspeed'].tail(10)

forecasted_values_df = pd.DataFrame()

for i in range(10):
    fitted_model = sm.ARIMA(pd.concat([df['windspeed'].head(len(df) - 10 + i), actual_values.head(i + 1)]), order=(4, 0, 4)).fit()
    
    next10_values = fitted_model.forecast(steps=10)
    
    next10_values.index = range(actual_values.index[-1] + 1, actual_values.index[-1] + 11)
    
    forecasted_values_df = pd.concat([forecasted_values_df, next10_values])

forecasted_values_df.reset_index(drop=True, inplace=True)

In [None]:
last_value = df['windspeed'].tail(1)

fitted_model = sm.ARIMA(df['windspeed'], order=(4, 0, 4)).fit() 

forecast_value = fitted_model.forecast(steps=1)
forecast_value.index = [actual_values.index[-1] + 1]

### Forcasted values

In [None]:
print("Last 10 actual values:")
print(actual_values.to_string)
print("-" * 30)  
print("Next 10 forecasted windspeed values:")
print(next10_values.to_string)
print("-" * 30)  

print("Last actual value:")
print(last_value.to_string)
print("-" * 30)  

print("Next forecasted value:")
print(forecast_value.to_string)

The difference in the single forecasted value compared to the subsequent ten forecasted values may arise due to the model's estimation. When requesting only the next value, the model is making a prediction solely for that immediate step, whereas when requesting the next ten values, the model might consider longer-term trends and patterns, resulting in potentially different forecasts.
Adjusting the model parameters or training it with different configurations might influence these variations. 

### Forecast Errors

In [None]:
forecast_errors = []

for i in range(10):
    forecast = fitted_model.forecast(steps=1)
    
    forecast_errors.append(actual_values.values[i] - forecast)
    
    fitted_model = sm.ARIMA(pd.concat([df['windspeed'].head(len(df) - 10 + i), actual_values.head(i)]), order=(4, 0, 4)).fit()

print("Forecast Errors:")
print(forecast_errors)

### Model Evaluating

In [None]:
mae = mean_absolute_error(actual_values, next10_values)

rmse = mean_squared_error(actual_values, next10_values, squared=False)

r_squared = r2_score(actual_values, next10_values)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r_squared}")

### Forecast on different parameters

In [None]:
p_new = 1 
d_new = 0 
q_new = 4

forecasted_values_df_2 = pd.DataFrame()

for i in range(10):
    fitted_model2 = sm.ARIMA(pd.concat([df['windspeed'].head(len(df) - 10 + i), actual_values.head(i + 1)]), order=(p_new, d_new, q_new)).fit()
    
    next10_values_2 = fitted_model2.forecast(steps=10)
    
    next10_values_2.index = range(actual_values.index[-1] + 1, actual_values.index[-1] + 11)
    
    forecasted_values_df_2 = pd.concat([forecasted_values_df_2, next10_values_2])

forecasted_values_df_2.reset_index(drop=True, inplace=True)

In [None]:
last_value = df['windspeed'].tail(1)

fitted_model2 = sm.ARIMA(df['windspeed'], order=(p_new, d_new, q_new)).fit() 

forecast_value_new = fitted_model2.forecast(steps=1)
forecast_value_new.index = [actual_values.index[-1] + 1]

In [None]:
print("Last 10 actual values:")
print(actual_values.to_string())
print("-" * 30)

print("Next 10 forecasted windspeed values:")
print(next10_values_2.to_string())
print("-" * 30)

print("Last actual value:")
print(last_value.to_string())
print("-" * 30)

print("Next forecasted value:")
print(forecast_value_new.to_string())

In [None]:
forecast_errors = []

for i in range(10):
    forecast = fitted_model2.forecast(steps=1)
    
    forecast_errors.append(actual_values.values[i] - forecast)
    
    fitted_model2 = sm.ARIMA(pd.concat([df['windspeed'].head(len(df) - 10 + i), actual_values.head(i)]), order=(p_new, d_new, q_new)).fit() 

print("Forecast Errors:")
print(forecast_errors) 

In [None]:
mae = mean_absolute_error(actual_values, next10_values2)
rmse = mean_squared_error(actual_values, next10_values2, squared=False)
r_squared = r2_score(actual_values, next10_values2)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r_squared}")

### Visualisation

#### Last 10 values vs one-step-ahead value

In [None]:
plt.figure(figsize=(10, 6))
forecast_index = actual_values.index[-1] + 1

plt.plot(actual_values.index, actual_values, label='Actual', color='blue') 

plt.plot([actual_values.index[-1], forecast_index], [actual_values.iloc[-1], forecast_value.iloc[0]], label='Forecast', color='red')

plt.xlabel('Time')
plt.ylabel('Wind speed')
plt.title('Last 10 values vs Forecasted Value')
plt.legend()

plt.show()

#### Last 10 values vs forecasted 10 values

In [None]:
plt.figure(figsize=(10, 6))

plt.plot(actual_values.index, actual_values, label='Actual', color='blue')

forecast_index = actual_values.index[-1] + 1

forecast_values_index = [actual_values.index[-1]] + list(range(forecast_index, forecast_index + 10))
forecast_values = [actual_values.iloc[-1]] + list(next10_values)

plt.plot(forecast_values_index, forecast_values, label='Next 10 Forecast', color='red', marker='o')

plt.xlabel('Time')
plt.ylabel('Values')
plt.title('Actual vs Next 10 Forecasted Values')
plt.legend()

plt.show()

# 

### Conclusion

The Mean Absolute Error (2.62) and Root Mean Squared Error (3.22) values suggest the model's predictive accuracy in terms of absolute and squared errors, but it still show that the model's performance might have room for improvement.

Lower values for MAE and RMSE are better, indicating smaller prediction errors. For R², a value closer to 1 is better, as it suggests the model explains a larger portion of the variability in windspeed. However, in my case, the negative R² indicates that my model might not be a great fit for this specific data, possibly due to the inadequacy of the model or the complexity of the underlying patterns in the data.


Nature of the variable I used for the predicting can significantly impact the model's performance. Wind speed data often exhibits high variability and can be influenced by various complex factors like weather patterns, geography, and seasonal variations. The inherent variability in wind speed data can make it challenging to capture precise patterns, impacting the model's predictive accuracy. considering the complexities in wind data, advanced modeling techniques should be explored, potentialy incorporating additional relevant features. 

I was testing this model with different parameters, scores were pretty much the same, even worse than these.

I did different approaches, changed size of dataset few times (tried with dataset based on weekly average for all the years, then with dataset for one year only due to computational requirements just to finish with complete dataset, for all the years to make model better on training.)

Also, I was testing different models and different features: I was working on temperature feature where I was using SARIMA, but had to give up due to computational requirements and seasonality of temperature dataset (I tried reducing datasets but it was still taking too long). 

At the end, I downloaded new dataset completely and was considering to try in that way, by bringing stock values dataset and performing ARMA model, but decided to stay with this one and not to change dataset for every task.


Even though my model provides some predictive capability, the scores suggest considering alternative modeling techniques to better capture complexities in the data. 

# 

# Text Analytics

### Importing dataset

In [None]:
df = pd.read_csv('climate_change_tweets.csv')
df.head(5)

In [None]:
df. info()

### Text Preprocessing

#### Converting text to lowercase

In [None]:
df['Lowercase_Text'] = df['Embedded_text'].apply(lambda x: x.lower())

print("Lowercase Text:")
print(df['Lowercase_Text'].head())
print("---------------------------------------------")

#### Removing special characters

In [None]:
def remove_special_chars(text):
    text = re.sub(r'http\S+|www\S+|@[^\s]+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df['No_Special_Chars_Text'] = df['Lowercase_Text'].apply(remove_special_chars)

print("Text without Special Characters, URLs, and Usernames:")
print(df['No_Special_Chars_Text'].head())
print("---------------------------------------------")

#### Tokenizing text

In [None]:
df['Tokenized_Text'] = df['No_Special_Chars_Text'].apply(word_tokenize)

print("Tokenized Text:")
print(df['Tokenized_Text'].head(10))
print("---------------------------------------------")

#### Removing stopwords

In [None]:
stop_words = set(stopwords.words('english'))
df['Without_Stopwords_Text'] = df['Tokenized_Text'].apply(lambda tokens: [token for token in tokens if token not in stop_words])

print("Text without Stopwords:")
print(df['Without_Stopwords_Text'].head(10))
print("---------------------------------------------")

### Text Categorisation / Sentiment Analysis

#### Data Labeling

In order not to do manual labeling or importing external labeled datasets, I will use Unsupervise Sentiment Analyisis. More precisely, I will use TextBlob to calculate sentiment polarity scores for each tweet text. The 'Sentiment_TextBlob' column will contain the polarity scores ranging from -1 (negative) to 1 (positive), indicating the sentiment polarity of the text.

In [None]:
df['Sentiment_TextBlob'] = df['Embedded_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Function to map sentiment scores to categorical labels
def map_sentiment(score):
    if score > 0.1:
        return 'positive'
    elif score < -0.1:
        return 'negative'
    else:
        return 'neutral'
    
print(df[['Embedded_text', 'Sentiment_TextBlob']].head(10))

I will create discrete categorical labels directly from the sentiment polarity scores.

In [None]:
df['Sentiment'] = df['Sentiment_TextBlob'].apply(map_sentiment)
print(df[['Embedded_text', 'Sentiment']].head(10))

#### Feature Extraction

After I have obtained the labeled dataset with sentiment scores assigned to the tweets, the next step would be feature extraction from the preprocessed text data. This step involves transforming the textual data into numerical features that machine learning models can understand and utilise for training. I will use TF-IDF method.

In [None]:
df['Processed_Text'] = df['Without_Stopwords_Text'].apply(lambda x: ' '.join(x))

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
tfidf_features = tfidf_vectorizer.fit_transform(df['Processed_Text'])
print("TF-IDF Matrix Shape:", tfidf_features.shape)

#### Model Training


For this step, I'll use the Multinomial Naive Bayes classifier, a common choice for text classification tasks.

In [None]:
X = tfidf_features
y = df['Sentiment']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

#### Predictions and Evaluations

In [None]:
predictions = nb_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

The model's accuracy of approximately 59.61% suggests that it correctly predicts the sentiment for around 59.61% of the tweets in the test set. However, the classification report reveals that the model's performance varies across different sentiment classes.

It appears that the model has challenges distinguishing 'negative' sentiments, as it has very low recall and F1-score for this class, indicating that it correctly identifies very few 'negative' sentiments compared to the actual instances present in the test set.

#### Improving model scores

Since dataset suffers from significant class imbalance (fewer 'negative' sentiments), addressing this issue could be beneficial and I will do it by using Oversampling technique, to help balance the representation of different sentiment classes, potentially improving the model's ability to learn from underrepresented classes.

In [None]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_train_oversampled, y_train_oversampled = oversample.fit_resample(X_train, y_train)

#### Retraining the model

In [None]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_oversampled, y_train_oversampled)

In [None]:
predictions = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

The drop in accuracy might indicate that the model, after oversampling, struggles to generalize well to unseen data or that the approach to balancing the classes did not sufficiently improve the model's overall performance.

I will try with Fine-tuning Hyperparameters.

#### Fine-tuning Hyperparameters

In [None]:
nb_classifier = MultinomialNB()

In [None]:
alpha_values = [0.1, 0.5, 1.0, 1.5, 2.0]

In [None]:
param_grid = {'alpha': alpha_values}

In [None]:
grid_search = GridSearchCV(nb_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_oversampled, y_train_oversampled)

In [None]:
best_alpha = grid_search.best_params_['alpha']
best_model = grid_search.best_estimator_

In [None]:
best_model.fit(X_train_oversampled, y_train_oversampled)

In [None]:
predictions = best_model.predict(X_test)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

#### Exploring Advanced Models


Advanced models might capture more complex relationships within the text data, allowing for better representation of sentiments. I will try to improve scores with SVM Model.

In [None]:
svm_classifier = SVC(kernel='linear', C=1.0)

In [None]:
svm_classifier.fit(X_train_oversampled, y_train_oversampled)

In [None]:
predictions_svm = svm_classifier.predict(X_test)

In [None]:
accuracy_svm = accuracy_score(y_test, predictions_svm)
report_svm = classification_report(y_test, predictions_svm)

print(f"Accuracy (SVM): {accuracy_svm}")
print("Classification Report (SVM):")
print(report_svm)

The SVM model's overall performance, as indicated by accuracy and F1-scores, has shown significant enhancement compared to the previous model attempts.

#### Hyperparameter Tuning for SVM

In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'], 
}

In [None]:
#svm_classifier = SVC()

In [None]:
#grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy')
#grid_search.fit(X_train_oversampled, y_train_oversampled)

In [None]:
#best_params = grid_search.best_params_
#best_model_svm = grid_search.best_estimator_

In [None]:
#best_model_svm.fit(X_train_oversampled, y_train_oversampled)

In [None]:
#predictions_svm_tuned = best_model_svm.predict(X_test)
#accuracy_svm_tuned = accuracy_score(y_test, predictions_svm_tuned)
#report_svm_tuned = classification_report(y_test, predictions_svm_tuned)

#print(f"Best Hyperparameters: {best_params}")
#print(f"Accuracy (Tuned SVM): {accuracy_svm_tuned}")
#print("Classification Report (Tuned SVM):")
#print(report_svm_tuned)

The overall performance of the tuned SVM model, as indicated by accuracy and F1-scores, shows some improvement compared to the default SVM model without hyperparameter tuning but it seems that's the best accuracy can go.

# 

### Topic Modeling

Topic modeling is a technique used to discover abstract topics present in a collection of documents. One popular algorithm for topic modeling is Latent Dirichlet Allocation (LDA).

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

#### Preprocessing Data

In [None]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(text.lower())
    
    tokens = [lemmatizer.lemmatize(token) for token in tokens 
              if token not in stop_words and token not in string.punctuation]

    return ' '.join(tokens) if tokens else ''

In [None]:
df['Processed_Text'] = df['Embedded_text'].fillna('').apply(preprocess_text)

In [None]:
vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
doc_term_matrix = vectorizer.fit_transform(df['Processed_Text'])
print(f"Shape of Document-Term Matrix: {doc_term_matrix.shape}")

#### Applying Latent Dirichlet Allocation (LDA) for Topic Modeling

In [None]:
num_topics = 5

In [None]:
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

In [None]:
lda.fit(doc_term_matrix)

#### Extracting Topics and Associated Words

In [None]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print()

In [None]:
num_top_words = 10 
feature_names = vectorizer.get_feature_names_out()
print(f"Top {num_top_words} words for each topic:")
display_topics(lda, feature_names, num_top_words)

#### Visualisation

In [None]:
def generate_wordclouds(model, feature_names):
    num_topics = len(model.components_)
    for topic_idx, topic in enumerate(model.components_):
        word_freq = {feature_names[i]: topic[i] for i in topic.argsort()[:-15 - 1:-1]} 
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(8, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"Word Cloud - Topic {topic_idx + 1}")
        plt.show()

generate_wordclouds(lda, feature_names)

#### Topic_based Analysis

In [None]:
def assign_dominant_topic(model, doc_term_matrix):
    dominant_topics = model.transform(doc_term_matrix)
    return dominant_topics.argmax(axis=1) + 1  # Adding 1 to start topics from index 1

df['Dominant_Topic'] = assign_dominant_topic(lda, doc_term_matrix)

topic_counts = df['Dominant_Topic'].value_counts().sort_index()
print("Tweet Count in Each Dominant Topic:")
print(topic_counts)

In [None]:
fig = px.bar(topic_counts, x=topic_counts.index, y=topic_counts.values, labels={'x': 'Dominant Topic', 'y': 'Tweet Count'},
             title='Tweet Count in Each Dominant Topic')
fig.update_traces(marker_color='skyblue')
fig.show()

#### Sentiment Analysis

In [None]:
def analyze_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

In [None]:
for topic in range(1, 6):
    topic_tweets = df[df['Dominant_Topic'] == topic]['Embedded_text']
    df[f'Topic_{topic}_Sentiment'] = topic_tweets.apply(analyze_sentiment)

In [None]:
for topic in range(1, 6):
    print(f"Sentiment Distribution in Topic {topic}:")
    print(df[f'Topic_{topic}_Sentiment'].value_counts())
    print()

Topics 4 and 5 predominantly exhibited a higher volume of positive sentiments compared to negative sentiments, with Topic 4 displaying the highest positive sentiment count. Conversely, Topic 4 also had a notable count of negative sentiments, showcasing a more balanced sentiment distribution. Topics 1, 2, and 3 showcased varying degrees of positive, negative, and neutral sentiments, with Topic 3 displaying a higher count of positive sentiments. Overall, the sentiment distributions indicate a mixed sentiment landscape surrounding climate change discussions, encompassing positive, negative, and neutral expressions across the identified topics.

#### Temporal Trends

In [None]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [None]:
df.set_index('Timestamp', inplace=True)

In [None]:
resampled_data = df['Text'].resample('D').count().reset_index()

In [None]:
fig = px.line(resampled_data, x='Timestamp', y='Text', title='Tweet Volume Related to Climate Change Over Time')
fig.update_xaxes(title='Time')
fig.update_yaxes(title='Tweet Count')
fig.show()

As we can see from the plot, there was significant spike in tweets on july 12th, 2022 with 175 tweets. 

# 

### Document Summarisation


Document summarization involves condensing a document's main points or content into a shorter representation while preserving its essence.

#### Extractive Summarization with TextRank Algorithm

In [None]:
summary = summarize(text, ratio=0.2)

print("Extractive Summary:")
print(summary)