## Sentiment analysis

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots

import praw
import time

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.util import ngrams

nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to C:\Users\Hung
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Hung
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# reddit = praw.Reddit(
#     client_id="i5bJftVVlTL8S0sLfNvtNA",
#     client_secret="vHawak1ZgPyEwCoe76Xyw_8TvR0GBQ",
#     user_agent="nlp",
#     username="Babe_My_Name_Is_Hung",
# )

# post_ids = ["bviwz5", "10vokml", "uuw4sl", "16sjcbv", "1csgm19", "1eojg9h", "1apjlja", "18ms0iq", "187bmst", "159x06b", "1at29fj", "17hfqfb", "1eqmzno"]
# comments = []

# for post_id in post_ids:
#     submission = reddit.submission(post_id)
#     submission.comments.replace_more(limit=None)
#     for comment in submission.comments.list():
#         comments.append(comment.body)
#     time.sleep(0.1)

In [3]:
comments_df = pd.read_csv("data/reddit_comments.csv")
comments = comments_df["text"].to_list()

In [4]:
stop_words = set(stopwords.words("english"))
stop_words.update(["like", "people", "vietnam", "vietnamese", "even", "one", "get", "deleted", "ye", "yo", "gon", "na", "got", "ta", "sure", "fuck", "shit", "asshole"])
stemmer = PorterStemmer()

In [5]:
def preprocess(text: str, return_tokens=True):
    # Remove newline characters
    text = text.replace("\n", " ")
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    tokens = word_tokenize(text)
    
    tokens = [
        stemmer.stem(word.lower()) for word in tokens
        if word.isalpha() and word.lower() not in stop_words
    ]
    if return_tokens:
        return tokens
    return " ".join(tokens)

In [6]:
def ngram_frequencies(tokens: list):
    bigrams = FreqDist(ngrams(tokens, 2))
    trigrams = FreqDist(ngrams(tokens, 3))
    return bigrams, trigrams

In [18]:
def plot_freqdist(freq_dist, title, plot_title, num=10):
    common_terms = freq_dist.most_common(num)
    terms, counts = zip(*common_terms)
    terms = [" ".join(term) if isinstance(term, tuple) else term for term in terms]
    fig = px.bar(x=terms, y=counts, labels={"x": "Terms", "y": "Frequency"})
    fig.update_layout(
        title=title,
        xaxis_tickangle=-45,
        width=700,  
        height=400,  
        margin=dict(l=40, r=40, t=40, b=40), 
        paper_bgcolor="#0E1117",
        plot_bgcolor="#0E1117",
    )
    fig.write_html(f"plots/{plot_title}.html")
    fig.show()

In [8]:
tokenized_combined_df = [token for text in comments for token in preprocess(text)]
bigrams_freq, trigrams_freq = ngram_frequencies(tokenized_combined_df)

In [9]:
bigrams_freq_dict = {' '.join(string): count for string, count in zip(bigrams_freq.keys(), bigrams_freq.values())}
trigrams_freq_dict = {' '.join(string): count for string, count in zip(trigrams_freq.keys(), trigrams_freq.values())}

bigrams_freq_dict.update(trigrams_freq_dict)

In [10]:
from wordcloud import WordCloud

wordcloud = WordCloud(width=1500, height=400, background_color='black', colormap='Blues', relative_scaling=0.7).generate_from_frequencies(bigrams_freq_dict)
wordcloud.to_file("plots/combined_wordcloud.png")

# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation = "bilinear")
# plt.axis('off')
# plt.show()

<wordcloud.wordcloud.WordCloud at 0x2b42e9a11d0>

In [11]:
wordcloud = WordCloud(width=800, height=400, background_color='black', colormap='hot').generate_from_frequencies(trigrams_freq_dict)
wordcloud.to_file("plots/trigram_wordcloud.png")

<wordcloud.wordcloud.WordCloud at 0x2b42e97b3d0>

In [3]:
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_dark"

In [19]:
plot_freqdist(bigrams_freq, title="Bigram Frequency Distribution", plot_title="bigram_freq", num=13)

In [20]:
plot_freqdist(trigrams_freq, "Trigram Frequency Distribution", plot_title="trigram_freq", num=13)

In [21]:
cleaned_comments = [preprocess(text, return_tokens=False) for text in comments]
cleaned_comments = [text for text in cleaned_comments if text.strip() != ""]

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import davies_bouldin_score
import numpy as np

vectorizer = TfidfVectorizer(tokenizer=lambda text: text.split(), lowercase=False, min_df=0.01, norm='l2', ngram_range=(2, 3))
vectorized_comments = vectorizer.fit_transform(cleaned_comments).toarray()

score_list = []
pred_clusters_list = []

for k in range(5, 20):
    clsr = AgglomerativeClustering(n_clusters=k) 
    # clsr = KMeans(n_clusters=k, random_state=129, n_init=1) 
    pred_cluster = clsr.fit_predict(vectorized_comments)
    pred_clusters_list.append(pred_cluster)
    score_list.append(davies_bouldin_score(vectorized_comments, pred_cluster))

pred_clusters = pred_clusters_list[np.argmax(score_list)]
k = list(range(5, 20))[np.argmax(score_list)]


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [23]:
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer

combined_df_list = []
sia = SentimentIntensityAnalyzer()

for topic in range(k):
    clustered_documents = [cleaned_comments[i] for i in range(len(cleaned_comments)) if pred_clusters[i] == topic] 
    sentiment_scores = [sia.polarity_scores(cleaned_comments[i])["compound"] for i in range(len(cleaned_comments)) if pred_clusters[i] == topic] 
    combined_text = ' '.join(clustered_documents)
    mean_scores = np.mean(sentiment_scores)
    sentiment = "Positive" if mean_scores >= 0 else "Negative"

    bigrams = FreqDist(word for word in ngrams(word_tokenize(combined_text), 2)).most_common(10)
    trigrams = FreqDist(word for word in ngrams(word_tokenize(combined_text), 3)).most_common(10)

    bigrams = np.array([(" ".join(string), freq) for string, freq in bigrams])
    trigrams = np.array([(" ".join(string), freq) for string, freq in trigrams])

    l2_df = pd.DataFrame(bigrams, columns=["Terms", "Frequency"])
    l3_df = pd.DataFrame(trigrams, columns=["Terms", "Frequency"])
    combined_df = pd.concat([l2_df, l3_df], axis=0)
    combined_df["Frequency"] = pd.to_numeric(combined_df["Frequency"])
    combined_df = combined_df.sort_values(["Frequency"], ascending=False).head(10)
    combined_df["Topic"] = topic + 1
    combined_df["Sentiment"] = sentiment
    combined_df_list.append(combined_df)

combined_topic_df = pd.concat(combined_df_list, axis=0).reset_index(drop=True)
# fig = sns.catplot(data=combined_topic_df, kind="bar", x="Frequency", y="Terms", col="Topic", col_wrap=4, sharey=False, sharex=False, height=3.5, aspect=1.5)
# plt.show()

In [24]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=4, cols=3, subplot_titles=combined_topic_df['Topic'].tolist(), horizontal_spacing=0.12, vertical_spacing=0.05)
color_mapping = {"Negative": "#ad3232", "Positive": "#358545"}

for i in range(len(combined_topic_df)):
    fig.add_trace(
        go.Bar(
            x=[combined_topic_df['Frequency'][i]], 
            y=[combined_topic_df['Terms'][i]], 
            name=str(combined_topic_df['Topic'][i]),
            marker_color=color_mapping[combined_topic_df['Sentiment'][i]],
            orientation='h',
            showlegend=False
        ),
        row=combined_topic_df['Topic'][i] % 4 if combined_topic_df['Topic'][i] % 4 != 0 else 4, 
        col=combined_topic_df['Topic'][i] % 3 if combined_topic_df['Topic'][i] % 3 != 0 else 3,
    )
    # fig.update_traces(marker_color='#1f77b4')

for i in range(len(combined_topic_df)):
    fig['layout']['annotations'][combined_topic_df['Topic'][i] - 1]['text'] = f"Topic {combined_topic_df['Topic'][i]}"
    fig.update_yaxes(
        autorange='reversed', 
        row=combined_topic_df['Topic'][i] % 4 if combined_topic_df['Topic'][i] % 4 != 0 else 4, 
        col=combined_topic_df['Topic'][i] % 3 if combined_topic_df['Topic'][i] % 3 != 0 else 3
    )
fig['layout']['annotations'][11]['text'] = ""

for sentiment, color in color_mapping.items():
    fig.add_trace(
        go.Bar(
            x=[None],  # Dummy data for the legend entry
            y=[None],
            marker_color=color,
            name=sentiment.capitalize(),  
            showlegend=True 
        )
    )

fig.update_layout(
    title="Most frequent terms (bi and tri-grams) for each topic",
    title_y=0.98,
    margin=dict(l=40, r=40, t=110, b=40), 
    height=1000, 
    width=1300, 
    showlegend=True,
    legend=dict(orientation='h', yanchor='top', xanchor='center', y=1.08, x=0.5),
    legend_title_text="Sentiment",
    paper_bgcolor="#0E1117",
    plot_bgcolor="#0E1117",
)

fig.write_html("plots/topic_model.html")
fig.show()

## Traffic situation

### Congestion Visualization



https://www.quora.com/How-bad-is-Vietnams-traffic-jam-compared-to-Thailand

In [25]:
import branca.colormap as cm

colormap = cm.LinearColormap(colors=list(reversed(cm.linear.RdYlGn_11.scale(0, 1).colors)))

In [3]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
import branca.colormap as cm
from datetime import datetime

# Load the datasets
nodes = pd.read_csv("data/traffic_flow/nodes.csv")
segments = pd.read_csv("data/traffic_flow/segments.csv")
segment_status = pd.read_csv("data/traffic_flow/segment_status.csv")

# Rename columns in nodes.csv for clarity
nodes.rename(columns={"_id": "node_id", "long": "longitude", "lat": "latitude"}, inplace=True)

# Merge segment data with node data for start and end nodes
segments = segments.merge(
    nodes, left_on="s_node_id", right_on="node_id", suffixes=("", "_start")
).rename(columns={"latitude": "lat_start", "longitude": "long_start"})
segments = segments.merge(
    nodes, left_on="e_node_id", right_on="node_id", suffixes=("", "_end")
).rename(columns={"latitude": "lat_end", "longitude": "long_end"})

# Merge with segment_status to include traffic velocity
traffic_data = pd.merge(segment_status, segments, left_on="segment_id", right_on="_id")

# Convert the 'updated_at' column to datetime
traffic_data['updated_at'] = pd.to_datetime(traffic_data['updated_at_x'])

# Extract hour and minute information from the datetime column
traffic_data['hour'] = traffic_data['updated_at'].dt.hour
traffic_data['minute'] = traffic_data['updated_at'].dt.minute

# Define the time range (6 AM to 10 PM), focusing on 30-minute periods (half-hour blocks)
traffic_data = traffic_data[(traffic_data['hour'] >= 3) & (traffic_data['hour'] <= 18)]
traffic_data['period'] = (traffic_data['hour'] * 60 + traffic_data['minute']) // 30  # Group by half-hour periods

# Define the discrete colormap with 4 ranges: Green (Low), Yellow (Moderate), Orange (High), Red (Very High)
colors = ['green', 'yellow', 'orange', 'red']  # 4 colors for 4 ranges
colormap = cm.StepColormap(colors, vmin=0, vmax=1, index=[0, 0.25, 0.5, 0.75, 1])

# Define the time periods (half-hour blocks from 6 AM to 10 PM)
time_periods = sorted(traffic_data['period'].unique())

# Create a map for each time period and save it
for period in time_periods:
    # Filter the data for the current time period
    period_data = traffic_data[traffic_data['period'] == period]
    
    # Compute min and max velocity for the current time period
    min_velocity = period_data["velocity"].min()
    max_velocity = min(period_data["velocity"].max(), 50)
    
    # Normalize the traffic intensity within the current time period
    period_data["intensity_normalized"] = period_data["velocity"].apply(
        lambda x: 1.0 - (min(x, 50) - min_velocity) / (max_velocity - min_velocity) if max_velocity != min_velocity else 0
    )

    # Create the map centered around the average node coordinates
    map_center = [nodes["latitude"].mean(), nodes["longitude"].mean()]
    m = folium.Map(
        location=map_center, 
        zoom_start=12, 
        tiles='openstreetmap', 
        # zoom_control=False,
        # scrollWheelZoom=False,
        # dragging=False
    )

    # Add traffic segments as polylines
    for _, row in period_data.iterrows():
        start = [row["lat_start"], row["long_start"]]
        end = [row["lat_end"], row["long_end"]]

        # Define color based on the normalized intensity
        intensity = row["intensity_normalized"]
        color = colormap(intensity)  # Get the color based on the intensity level
        
        # Add polyline for this traffic segment
        folium.PolyLine(
            locations=[start, end],
            color=color,
            weight=5,
            opacity=0.6,
            tooltip=(
                f"Street: {row['street_name']}<br>"
                f"Velocity: {row['velocity']} km/h<br>"
                f"Max Velocity: {row['max_velocity']} km/h<br>"
                f"Length: {row['length']} m"
            ),
        ).add_to(m)

    # Add the colormap legend
    colormap.caption = f"Traffic Intensity (Period {period}): Green (Low), Red (Very High)"
    colormap.add_to(m)

    # Save the map as an HTML file
    time_label = f"{(period // 2) + 3}_{'00' if period % 2 == 0 else '30'}"
    if period // 2 + 3 < 10:
        time_label = '0' + time_label
    m.save(f"plots/traffic_flow/traffic_map_period_{time_label}.html")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  period_data["intensity_normalized"] = period_data["velocity"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  period_data["intensity_normalized"] = period_data["velocity"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  period_data["intensity_normalized"] = period_data["velocity"].apply

### Traffic Accidents

In [4]:
traffic_accident_df = pd.read_csv("data/traffic-acc-2013-2023.csv", names=["Year", "Deaths", "TotalAccidents"])
traffic_accident_df["Year"] = traffic_accident_df["Year"].astype(str)
df_melted = pd.melt(traffic_accident_df, 
                    id_vars=['Year'], 
                    value_vars=['Deaths', 'TotalAccidents'],
                    var_name='Category', 
                    value_name='Accidents')

df_melted['Category'] = df_melted['Category'].replace({'Deaths': 'Death', 'TotalAccidents': 'Total Accidents'})

color_mapping = {
    'Death': 'red',  
    'Total Accidents': '#1f77b4'  
}
fig = make_subplots(rows=1, cols=1)
fig = px.area(df_melted, x="Year", y="Accidents", markers=".", color="Category", color_discrete_map=color_mapping, title="Traffic Accidents from 2013-2023 (in thousands)")
fig.update_layout(
    width=800,  
    height=400,  
    margin=dict(l=40, r=40, t=40, b=40),
    paper_bgcolor="#0E1117",
    plot_bgcolor="#0E1117",
)
fig.write_html("plots/traffic_accidents.html")
fig.show()

## Causes

### Poor infrastructure

### Motorbike Ownership

https://www.statista.com/forecasts/1279573/vietnam-motorcycle-number-volume-market

https://www.statista.com/statistics/1337061/vietnam-most-used-modes-of-transportation/

https://tuoitrenews.vn/news/society/20231224/vietnam-crowned-king-of-motorbikes-in-southeast-asia/77435.html

In [27]:
df_motor_own = pd.read_csv('data/motor_own_country.csv')
df_volume = pd.read_csv('data/volume_motorbikes.csv')
df_trans_percent = pd.read_csv('data/transportation_mode_percent.csv')


df_volume_pred = df_volume[(df_volume["Year"] >= 2023)& (df_volume["Year"] <= 2026)]
df_volume = df_volume[df_volume["Year"] < 2024]
df_volume["Year"] = df_volume["Year"].astype(str)

In [28]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_volume["Year"],
        y=df_volume["Count"],
        mode="lines+markers",
        name="Actual Volume",
        line=dict(color="#1f77b4")
    )
)

fig.add_trace(
    go.Scatter(
        x=df_volume_pred["Year"],
        y=df_volume_pred["Count"],
        mode="lines+markers",
        name="Predicted Volume",
        line=dict(color="red")
    )
)

fig.update_layout(
    title="Production volume of motorcycles in Vietnam",
    width=1200,  
    height=400,  
    margin=dict(l=40, r=40, t=40, b=40), 
    yaxis=dict(autorange="reversed", title="Count (in 1000)"),
    xaxis=dict(title="Year"),
    legend=dict(title="Legend"),
    paper_bgcolor="#0E1117",
    plot_bgcolor="#0E1117",
)
fig.write_html("plots/annual_volume.html")
fig.show()

In [29]:
fig = px.bar(df_motor_own, x="Percentage", y="Country", orientation='h')
fig.update_layout(
    title="Percentage of ownership of motorbikes in Asia",
    width=700,  
    yaxis=dict(autorange="reversed"),
    height=400,  
    margin=dict(l=40, r=40, t=40, b=40), 
    paper_bgcolor="#0E1117",
    plot_bgcolor="#0E1117",
)
fig.write_html("plots/ownership_asia.html")
fig.show()

In [30]:
fig = px.bar(df_trans_percent, x="Percentage", y="Vehicle", orientation='h')
fig.update_layout(
    title="Percentage of tranportation mode in Vietnam",
    width=700,  
    yaxis=dict(autorange="reversed"),
    height=400,  
    margin=dict(l=40, r=40, t=40, b=40), 
    paper_bgcolor="#0E1117",
    plot_bgcolor="#0E1117",
)
fig.write_html("plots/transport_mode_percent.html")
fig.show()

### Population Density

In [31]:
import h3
import pandas as pd
from shapely.geometry import Polygon
from geojson import Feature, Point, FeatureCollection
import numpy as np

# Load your data
df = pd.read_csv('data/population-density.csv')

In [5]:
class h3_grid():
    def __init__(self, resolution=4):
        self.resolution = resolution
        self.grid = False

    def row_to_h3cell(self, row) -> str:
        return h3.latlng_to_cell(lat=row['Y'], lng=row['X'], res=self.resolution)
        
    def fit(self, df_input: pd.DataFrame):
        df = df_input.copy()
        df['h3_cell'] = df.apply(self.row_to_h3cell, axis=1)
        df = df.groupby('h3_cell').agg({"Z": "sum"}).reset_index()
        df = df.rename(columns={"Z": "count"})
        self.grid = df
  
def hex_to_geojson(df_hex, hex_id_field, geometry_field, value_field):
    list_features = []
    for i, row in df_hex.iterrows():
        feature = Feature(
            geometry=row[geometry_field],
            id=row[hex_id_field],
            properties={"value": row[value_field]},
        )
        list_features.append(feature)
        feat_collection = FeatureCollection(list_features)
    return feat_collection

def add_geometry(row):
    points = h3.cell_to_boundary(row["h3_cell"])
    return Polygon(points)

def load_geojson(df_h3):
    df_h3["geometry"] = df_h3.apply(add_geometry, axis=1)
    geojson_object = hex_to_geojson(
        df_h3, hex_id_field="h3_cell", value_field="count", geometry_field="geometry"
    )
    return geojson_object

def flip_coordinates(geojson_data):
    for feature in geojson_data['features']:
        for i, coordinates in enumerate(feature['geometry']['coordinates']):
            # Iterate through the coordinates and swap them
            for j in range(len(coordinates)):
                coordinates[j] = [coordinates[j][1], coordinates[j][0]]  # [longitude, latitude]
    return geojson_data

In [6]:
grid_object = h3_grid()
grid_object.fit(df)

df_h3 = grid_object.grid.sort_values('count')
geojson_object = load_geojson(df_h3)

# Flip the coordinates in the GeoJSON
flipped_geojson = flip_coordinates(geojson_object)

In [7]:
fig = px.choropleth_mapbox(
    df_h3,
    geojson=flipped_geojson,
    locations="h3_cell",
    color=pd.cut(df_h3["count"], bins=[0, int(1e5), int(5e5), int(1e6), np.inf]).astype(str),
    color_discrete_map={
        "(0.0, 100000.0]": "#1a02b8",
        "(100000.0, 500000.0]": "#ae22b3",
        "(500000.0, 1000000.0]": "red",
        "(1000000.0, inf)": "yellow",
    },
    hover_data=["count"],
    center=dict(lat=df['Y'].mean(), lon=df['X'].mean()),
    zoom=5,
    width=600,
    height=800,
    opacity=0.5,
    labels={"color": "Population Count", "count": "Population Count"},
    mapbox_style="open-street-map",
)

fig.update_geos(projection_type="foucaut")

fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    autosize=False,
    margin=dict(l=0, r=0, b=0, t=0, pad=4, autoexpand=True),
    dragmode=False,
)
fig.write_html("plots/population_density.html")
fig.show()