In [1]:
!pip install streamlit
!pip install streamlit pyngrok
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[

#web scrape

In [None]:
# if needed: pip install requests or conda install requests
import requests
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd


def find_console_tags(soup):
    """Collect the console label of every console icon on the page.

    Console tags are stored as images, so the label is read from the ``alt``
    attribute of each ``<img>`` whose ``src`` contains ``images/consoles``.

    Args:
        soup: Parsed page exposing ``find_all('img')``; each returned image
            must support ``.get('src')`` and ``['alt']`` lookups.

    Returns:
        list: The ``alt`` values of the console images, in document order.
    """
    consoles = []
    for img in soup.find_all('img'):
        # img['src'] raises KeyError for an <img> without a src attribute;
        # such images are simply not console icons.
        if 'images/consoles' in (img.get('src') or ''):
            consoles.append(img['alt'])
    return consoles


# Find the names of games from the links
def find_names_column(table_path):
    """Return the link text found in the cells of a table.

    Walks every ``tr`` under ``table_path`` and every ``td`` under each row;
    for each cell where ``td.find('a')`` returns an element, the stripped text
    of that element is recorded.

    Args:
        table_path: Element exposing ``xpath`` (the scraped data table).

    Returns:
        list: One stripped string per cell that contains a link, in document
        order. A link without leading text contributes an empty string so the
        list keeps one entry per linked cell.
    """
    names_list = []
    for row in table_path.xpath('.//tr'):
        for td in row.xpath('.//td'):
            link = td.find('a')
            if link is None:
                continue
            # link.text is None when the anchor has no leading text (for
            # example when it only wraps another element); calling .strip()
            # on it would raise AttributeError.
            names_list.append((link.text or '').strip())
    return names_list

# Write a function that takes in a VGChartz URL and gives us all the data in their video game database
def scrape_vgchartz_videogame_db_page(url):
    """Scrape one VGChartz results page into a DataFrame.

    The page is fetched once and parsed twice: with BeautifulSoup (to read the
    console icons) and with lxml (to walk the data table by XPath).

    Args:
        url: Full URL of a VGChartz game-database results page.

    Returns:
        pandas.DataFrame: One row per table row after the three header rows,
        with the columns listed in ``columns`` below minus ``blank``. The
        ``game`` and ``console`` columns are overwritten with the values
        returned by ``find_names_column`` and ``find_console_tags``.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        IndexError: If the page has no table under ``#generalBody``.
        ValueError: If a data row does not have one cell per expected column,
            or the scraped names/consoles do not match the number of rows.
    """
    columns = ['position', 'game', 'blank', 'console', 'publisher', 'developer',
               'vgchart_score', 'critic_score', 'user_score', 'total_shipped',
               'total_sales', 'na_sales', 'pal_sales', 'japan_sales',
               'other_sales', 'release_date', 'last_update']

    # (connect, read) timeout so a stalled connection cannot hang the scrape forever.
    response = requests.get(url, timeout=(10, 300))

    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Website not OK (status {response.status_code})", response=response)

    soup = BeautifulSoup(response.text, "lxml")
    doc = lh.fromstring(response.content)

    # Selects the table with all the data in it on HTML using xpath
    target_table_path = doc.xpath('//*[@id="generalBody"]/table')[0]

    # Find column values that won't be scraped correctly with .text option
    names_list = find_names_column(target_table_path)
    consoles = find_console_tags(soup)

    # Parse non-image and non-URL info from the data table
    rows = []
    for counter, row in enumerate(target_table_path.xpath(".//tr")):
        if counter <= 2:  # To skip header rows
            continue
        cells = [td.text for td in row.xpath(".//td")]
        if len(cells) != len(columns):
            raise ValueError(
                f"Row {counter} has {len(cells)} cells, expected {len(columns)}")
        rows.append(cells)

    # Passing the column names here keeps the schema even when the page has no
    # data rows, so an empty page yields an empty frame instead of an error.
    df = pd.DataFrame(rows, columns=columns).drop(columns='blank')

    # Correct the console and game columns using scraped values
    df['console'] = consoles
    df['game'] = names_list
    return df

# We can 'hack' the URL to display any number of results per page, so the page size is left as an argument.
def scrape_all_vg_chartz_videogame_db(results_per_page):
    """Scrape every page of the VGChartz game database.

    Pages are requested one after another, starting at page 1, until a page
    comes back with fewer rows than ``results_per_page``.

    Args:
        results_per_page: Number of results requested per page (sent as the
            ``results`` query parameter).

    Returns:
        pandas.DataFrame: All scraped pages concatenated, with a fresh
        0..n-1 index.
    """
    # Adjacent string literals are joined by the parser. The previous
    # backslash continuations sat *inside* one literal, so the indentation of
    # each continued line ended up in the URL (e.g. "ownership        =Both").
    fixed_query = (
        '&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
        '&boxart=Both&banner=Both&showdeleted=&region=All&goty_year=&developer='
        '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1'
        '&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=1'
        '&showreleasedate=1&showlastupdate=1&showvgchartzscore=1'
        '&showcriticscore=1&showuserscore=1&showshipped=1&alphasort='
        '&showmultiplat=No'
    )

    pages = []
    current_page = 1
    while True:
        url = ('http://www.vgchartz.com/games/games.php?page=' + str(current_page)
               + '&results=' + str(results_per_page) + fixed_query)
        new_df = scrape_vgchartz_videogame_db_page(url)
        pages.append(new_df)
        print('Scraped page: ', current_page)
        # A short page means there is nothing left to fetch.
        if new_df.shape[0] < results_per_page:
            break
        current_page += 1

    # Concatenate once at the end instead of growing a frame inside the loop.
    df = pd.concat(pages, ignore_index=True)
    print('Scraping done!')
    print('Total rows parsed = ', df.shape[0])
    return df

# Run the code to scrape! I did 10,000 rows per page to speed things up.
# The full scrape is slow, so the compressed pickle written by a previous run is
# reused when it exists; delete the file to force a fresh scrape.
import os

vgchartz_cache = './FullVGChartzDatabase.zip'
if os.path.exists(vgchartz_cache):
    df = pd.read_pickle(vgchartz_cache, compression='zip')
else:
    df = scrape_all_vg_chartz_videogame_db(10000)
    # Compress and store for later!
    df.to_pickle(vgchartz_cache, compression='zip')

KeyboardInterrupt: 

In [None]:
import pandas as pd

# Re-export the zipped pickle of the scraped database as a plain CSV (no index column).
pickle_path = './FullVGChartzDatabase.zip'
csv_path = 'VGChartzDatabase.csv'

df = pd.read_pickle(pickle_path, compression='zip')
df.to_csv(csv_path, index=False)

#605

In [None]:
import pandas as pd


In [None]:
#vg games the player values are in millions

In [None]:
# vgames=pd.read_csv("/content/video_game.csv")

In [None]:
#GAD     [0 --> 3] [not at all,  several days,  over half the days, nearly always]   (more is bad)

#feeling nervous or anxious
#not being able to control or stop worrying
#worrying too much about other things
#trouble relaxing
#being restless
#becoming easily annoyed or irritated
#feeling awful, as if something bad is going to happen

#GADE  - If you checked any of these problems,
#how difficult did they make it for you to do your work, take care of things at home, or get along with people



In [None]:
#SWL    [1 --> 7] [Strongly disagree, Disagree, slightly disagree, neutral, slightly agree, agree, strongly agree]  (more is good)

#satisfied
#ideal
#excellent life
#would not change my life
#gotten the important things I want in my life

In [None]:
#narc [1 - 5]   from not true - very true

In [None]:
import pandas as pd

# Read the survey once; every correlation below is derived from this frame.
df = pd.read_csv("/content/GamingStudy_data.csv", encoding='latin-1')

# For each target column: drop the rows where it is missing, then rank the other
# numeric columns by their correlation with it (largest first).
correlations_by_target = {}
for target in ("Hours", "Narcissism", "streams"):
    df_clean = df.dropna(subset=[target])
    numeric_df = df_clean.select_dtypes(include=['number'])
    correlations = numeric_df.corr()[target].drop(target).sort_values(ascending=False)
    correlations_by_target[target] = correlations
    # print(correlations)

# After the loop df_clean / numeric_df / correlations hold the "streams" results,
# as before; correlations_by_target keeps all three rankings.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np

# Regression imputation of the missing "streams" values
target = "streams"
features = ["Hours", "SPIN_T"]

# Train only on rows where the target ("streams") and both predictors are present
df_filtered = df.dropna(subset=[target] + features)
X_train = df_filtered[features]
y_train = df_filtered[target]

# Mean fill when every predictor has |skew| < 1, median fill otherwise
max_abs_skew = X_train.skew().abs().max()
imputation_strategy = 'mean' if max_abs_skew < 1 else 'median'

# Fill gaps in the predictors, then regress the target on them
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy=imputation_strategy)),
    ('regressor', LinearRegression()),
])
pipeline.fit(X_train, y_train)

# Rows whose "streams" value is missing and has to be predicted
missing_mask = df[target].isnull()
missing_stream = df[missing_mask].copy()

if not missing_stream.empty:
    # Predictions are rounded to whole numbers before being written back
    predicted = pipeline.predict(missing_stream[features])
    df.loc[missing_mask, target] = np.round(predicted)
    print("Missing 'streams' values imputed and updated in df.")
else:
    print("No missing values in 'streams' column to impute.")


Missing 'streams' values imputed and updated in df.


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import numpy as np

# Regression imputation of the missing "Narcissism" values
target = "Narcissism"
features = ["GAD6", "GAD_T", "GAD5"]

# Train only on rows where the target ("Narcissism") and all predictors are present
df_filtered = df.dropna(subset=[target] + features)
X_train = df_filtered[features]
y_train = df_filtered[target]

# Mean fill when every predictor has |skew| < 1, median fill otherwise
max_abs_skew = X_train.skew().abs().max()
imputation_strategy = 'mean' if max_abs_skew < 1 else 'median'

# Fill gaps in the predictors, then regress the target on them
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy=imputation_strategy)),
    ('regressor', LinearRegression()),
])
pipeline.fit(X_train, y_train)

# Rows whose "Narcissism" value is missing and has to be predicted
missing_mask = df[target].isnull()
missing_narc = df[missing_mask].copy()

if not missing_narc.empty:
    # Predictions are rounded to whole numbers before being written back
    predicted = pipeline.predict(missing_narc[features])
    df.loc[missing_mask, target] = np.round(predicted)
    print("Missing 'Narcissism' values imputed and updated in df.")
else:
    print("No missing values in 'Narcissism' column to impute.")


Missing 'Narcissism' values imputed and updated in df.


In [None]:
# Regression imputation of the missing "Hours" values
target = "Hours"
features = ["streams", "SPIN_T", "SPIN13", "SPIN16", "SPIN12",
            "Narcissism", "SPIN8", "SPIN10", "SPIN3", "SPIN14"]

# Train only on rows where the target ("Hours") and all predictors are present
df_filtered = df.dropna(subset=[target] + features)
X_train = df_filtered[features]
y_train = df_filtered[target]

# Mean fill when every predictor has |skew| < 1, median fill otherwise
max_abs_skew = X_train.skew().abs().max()
imputation_strategy = 'mean' if max_abs_skew < 1 else 'median'

# Fill gaps in the predictors, then regress the target on them
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy=imputation_strategy)),
    ('regressor', LinearRegression()),
])
pipeline.fit(X_train, y_train)

# Rows whose "Hours" value is missing and has to be predicted
missing_mask = df[target].isnull()
missing_hours = df[missing_mask].copy()

if not missing_hours.empty:
    # Unlike the other imputed columns, the predictions are written back unrounded
    df.loc[missing_mask, target] = pipeline.predict(missing_hours[features])
    print("Missing 'Hours' values imputed and updated in df.")
else:
    print("No missing values in 'Hours' column to impute.")

Missing 'Hours' values imputed and updated in df.


In [None]:
# Columns to drop: five identifier-like columns, every individual questionnaire
# item (GAD1-7, SWL1-5, SPIN1-17), plus 'SPIN_T' and 'accept'.
s_drop = (
    ['Unnamed: 0', 'Zeitstempel', 'Birthplace_ISO3', 'Residence_ISO3', 'highestleague']
    + [f'GAD{i}' for i in range(1, 8)]
    + [f'SWL{i}' for i in range(1, 6)]
    + [f'SPIN{i}' for i in range(1, 18)]
    + ['SPIN_T', 'accept']
)
sgames = df.drop(columns=s_drop)



In [None]:
sgames['whyplay'].unique()

array(['having fun', 'improving', 'relaxing', 'winning',
       'improving, having fun', 'All',
       "I play it as I watch TV or movies.  I've gone through many a Netflix binge with Isaac.",
       'Reaching goal i.e. GM ',
       'Improving AND having fun (kinda wish I could pick more than one)',
       'all of the above', 'passing the time', 'having fun and improving',
       'Forgetting troubles', 'All of them', 'Being with friends',
       'getting good loot', 'have fun and win',
       'Winning 55% improving 45%', 'Wasting time',
       'talking to irl friends', 'Forgetting about my drug addiction',
       'Having fun and improving at the same time',
       'Improving and having fun.', 'improving while having fun',
       'All of the above!', 'socializing', 'all above', 'playing well',
       'winning + having fun', 'having a distraction',
       'the three last: improving, relaxing and having fun',
       'winning and improving',
       'depends, wwinning in rankeds, fun and re

In [None]:
!pip install streamlit
!pip install streamlit pyngrok
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[

#steramtest

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
# Page heading
st.title("Video Game Sales Analysis")


def _players_scatter(frame, title):
    """Bubble chart of global players per release year, coloured by genre."""
    return px.scatter(
        frame,
        x="Year_of_Release",
        y="Global_players",
        size="Global_players",
        color="Genre",
        hover_name="Name",
        title=title,
    )


# Small hard-coded sample used for the demonstration
data = {
    "Name": ["Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii Sports Resort"],
    "Year_of_Release": [2006, 1985, 2008, 2009],
    "Global_players": [82.53, 40.24, 35.52, 32.77],
    "Genre": ["Sports", "Platform", "Racing", "Sports"],
}
df = pd.DataFrame(data)

# Show the raw table, then the chart over all rows
st.subheader("Dataset Preview")
st.write(df)

fig = _players_scatter(df, "Global Players vs. Year of Release")
st.plotly_chart(fig)

# Year slider spanning the data's range; the default selection is the full range
first_year = int(df["Year_of_Release"].min())
last_year = int(df["Year_of_Release"].max())
year_range = st.slider(
    "Select Year Range",
    min_value=first_year,
    max_value=last_year,
    value=(first_year, last_year),
)

# Keep the rows whose release year lies inside the selected range (bounds included)
filtered_df = df[df["Year_of_Release"].between(year_range[0], year_range[1])]

# Table and chart for the selected range
st.subheader("Filtered Data")
st.write(filtered_df)

fig_filtered = _players_scatter(
    filtered_df,
    f"Global Players vs. Year of Release (Years {year_range[0]} to {year_range[1]})",
)
st.plotly_chart(fig_filtered)


In [None]:
from pyngrok import ngrok
ngrok.kill()

In [None]:
# Read the ngrok token from the NGROK_AUTH_TOKEN environment variable, or prompt
# for it, so the secret is never stored in the notebook. The token that used to
# be hardcoded here has been exposed and should be revoked in the ngrok dashboard.
import os
from getpass import getpass

ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: "))

In [None]:
!nohup streamlit run app.py &

In [None]:
public_url = ngrok.connect(8501)
print(public_url)

# games_pcy

In [2]:
%%writefile app.py
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px
from sklearn.impute import KNNImputer
import nltk
import re
from nltk.stem import WordNetLemmatizer
from scipy.stats import chi2_contingency, f_oneway
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


# Cache data loading for better performance
@st.cache_data
def load_data(path="/content/GamingStudy_data.csv"):
    """Read the gaming-study survey CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the path the app has
            always used, so existing ``load_data()`` calls are unaffected.

    Returns:
        pandas.DataFrame: The file contents, decoded as latin-1.
    """
    return pd.read_csv(path, encoding='latin-1')

nltk.download('wordnet')
nltk.download('omw-1.4')

# Enhanced categorization with priority system
categories = {
    'Distraction': ['distract', 'escape', 'forget', 'stress', 'anxiety', 'avoid',
                    'reality', 'pain', 'trouble', 'problem', 'depression', 'nervous'],
    'Habit/Time Pass': ['habit', 'time', 'pass', 'bored', 'routine', 'kill time',
                        'occupied', 'waste', 'fill', 'nothing', 'procrastinate'],
    'Social': ['friend', 'team', 'coop', 'multiplayer', 'social', 'together',
               'community', 'connect', 'bond', 'relationship', 'with others', 'family'],
    'Compete/Win': ['win', 'compete', 'victory', 'rank', 'ladder', 'gm', 'climb',
                    'top', 'leaderboard', 'dominate', 'triumph', 'beat', 'champion'],
    'Improve/Skill': ['improve', 'learn', 'skill', 'progress', 'master', 'practice',
                      'better', 'develop', 'growth', 'hone', 'enhance', 'advance'],
    'Fun/Relax': ['fun', 'relax', 'enjoy', 'chill', 'unwind', 'distress', 'joy',
                  'pleasure', 'entertain', 'distraction', 'happiness']
}
category_priority = ['Distraction', 'Habit/Time Pass', 'Social',
                    'Compete/Win', 'Improve/Skill', 'Fun/Relax']

def categorize_whyplay(text):
    """Map a free-text "why do you play" answer to motivation categories.

    The answer is lower-cased, stripped of punctuation and split on
    whitespace. Each token is also lemmatized as a verb. A category matches
    when one of its keywords is a substring of any token or lemma; keywords
    that contain a space are matched as a substring of the cleaned answer
    instead, because no single token can ever contain them.

    Args:
        text: The answer; it is converted with ``str()`` first, so non-string
            values are accepted.

    Returns:
        list: Matching category names in ``category_priority`` order; a copy
        of all categories when the answer contains the token 'all' or
        'every'; ``['Other']`` when nothing matches.
    """
    cleaned = re.sub(r'[^\w\s]', '', str(text).lower())
    tokens = cleaned.split()

    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    verb_tokens = {lemmatizer.lemmatize(word, pos='v') for word in tokens}
    combined_tokens = set(tokens) | verb_tokens

    # Return a copy so callers cannot mutate the shared priority list.
    if 'all' in combined_tokens or 'every' in combined_tokens:
        return list(category_priority)

    # Single-spaced text, used for keywords such as 'kill time' / 'with others'.
    normalized = ' '.join(tokens)

    def keyword_hit(keyword):
        if ' ' in keyword:
            return keyword in normalized
        return any(keyword in token for token in combined_tokens)

    matched = [category for category in category_priority
               if any(keyword_hit(keyword) for keyword in categories[category])]
    return matched if matched else ['Other']

# Load and process data
df = load_data()
df['whyplay_cats'] = df['whyplay'].apply(categorize_whyplay)
df_exploded = df.explode('whyplay_cats')

st.title("Data Story on Gaming 🎮")

# Streamlit app

tab0, tab1, tab2, tab3, tab4 = st.tabs([
    "📝 Introduction",
    "🌟 Reason vs Gender",
    "🌍 Addiction and Satisfaction",
    "📊 Math behind Games",
    "🎮 Game Clusters"
])

with tab0:
    st.subheader("A DATA 605 Project by Lalith and Ojas")
    st.subheader("Why Gaming ??!!")
    st.markdown("""
    Did you know that the biggest industry in terms of revenue isn’t movies or music? Surprisingly,
    it’s the video gaming industry, which generated a staggering 💲187 billion in 2023. In comparison,
    the movie industry brought in 💲133 billion, while the music industry trailed far behind at 💲28 billion.
    That means video games earn more than both industries combined.With such immense growth and potential,
    it's clear that the video game industry is a gold mine worth exploring.
    """)

    st.markdown("""

    1. Gaming Industry
-	2022: $184 billion (Newzoo)
 Driven by mobile, console, and PC gaming, with strong growth in emerging markets.
-	2023: $187 billion (estimated)
	Moderate growth due to post-pandemic normalization and mobile dominance.
-	2024: $200 billion (projected)
	Expected growth from cloud gaming, esports, and next-gen hardware.
________________________________________
2. Music Industry
-	2022: $26.2 billion (IFPI)
	Streaming accounted for 67% of recorded music revenue.
-	2023: $28–29 billion (estimated)
	Continued growth in streaming and live events post-pandemic.
-	2024: $31–33 billion (projected)
	Expansion in emerging markets and vinyl/cassette nostalgia trends.
________________________________________
3. Movie Industry
-	2022:	Theatrical: \$26 billion (Global Box Office)
	Total (including streaming): $90 billion (MPA estimates).
-	2023:	Theatrical: \$33 billion (Box Office Mojo)
	Total: $100 billion (streaming platforms like Netflix and Disney+).
-	2024: $105–110 billion (projected)
	Hybrid releases (theatrical + streaming) and international markets driving growth.
________________________________________
4. Sports Industry
-	2022: $487 billion (Statista)
	Includes media rights, sponsorships, merchandise, and live events.
-	2023: $500–520 billion (estimated)
	Recovery in live attendance and rising media deals (e.g., NFL, Premier League).
-	2024: $550–600 billion (projected)
	Growth in digital streaming rights and global events (e.g., Olympics, FIFA World Cup)

    """)

    st.subheader("Info about dataset")

    st.markdown("""
    GAD     [0 --> 3]   [not at all,  several days,  over half the days, nearly always]

- If u feel nervous, anxious
- not being able to control or stop worrying
- worrying too much about other things
- trouble relaxing
- being restless
- becoming easily annoyd or irritated
- feeling awfull as somthin bad is goin to happen
""")

    st.markdown("<span style='color: red; font-weight: bold;'>More is Bad!</span>", unsafe_allow_html=True)

    st.markdown("""

SWL    [1 --> 7] [Strongly disagree, Disagree, silghtly disagree, neutral, silghtly agre, agree, strongly disagree]

- satisfied
- ideal
- excellent life
- wont not change my life
- gotten important things in I want in my life
 """)

    st.markdown("<span style='color: green; font-weight: bold;'>More is Good!</span>", unsafe_allow_html=True)


with tab1:
    st.header("Player Motivation Analysis")

    # Create filter buttons in columns
    st.subheader("Filter Options")
    filter_col1, filter_col2, filter_col3 = st.columns(3)

    # Gender Filter
    with filter_col1:
        gender_options = ["All"] + df['Gender'].dropna().unique().tolist()
        selected_gender = st.radio(
            "Gender",
            gender_options,
            index=0,
            key="gender_filter"
        )

    # Apply filters
    filtered_exploded = df_exploded.copy()

    if selected_gender != "All":
        filtered_exploded = filtered_exploded[filtered_exploded['Gender'] == selected_gender]

    # Create visualization with filtered data
    # rename_axis + reset_index(name=...) pins the column names explicitly; a bare
    # value_counts().reset_index() only produces a 'count' column on pandas >= 2.0.
    motivation_counts = (
        filtered_exploded['whyplay_cats']
        .value_counts()
        .rename_axis('whyplay_cats')
        .reset_index(name='count')
    )
    fig = px.bar(motivation_counts,
                x='count', y='whyplay_cats', orientation='h',
                title="Gaming Motivations Breakdown",
                labels={'whyplay_cats': 'Motivation Category', 'count': 'Player Count'},
                color='whyplay_cats',
                color_discrete_sequence=px.colors.qualitative.Pastel)

    # Dynamic subtitle with active filters
    filter_text = []
    if selected_gender != "All": filter_text.append(f"Gender: {selected_gender}")


    if filter_text:
        st.caption(f"Active filters: {', '.join(filter_text)}")

    fig.update_layout(
        height=500,
        width=800,
        yaxis_title="Motivation Category",
        xaxis_title="Number of Players",
        legend_title="Motivation",
        showlegend=False  # Cleaner look for horizontal bars
    )

    st.plotly_chart(fig)
    # Chi-square test for association between Gender and whyplay_cats
    st.header("Statistical Analysis")
    contingency_table = pd.crosstab(df_exploded['Gender'], df_exploded['whyplay_cats'])
    chi2, p, _, _ = chi2_contingency(contingency_table)

    st.subheader("Chi-Square Test Results")
    st.markdown(f"""
    - **Chi-Square Statistic**: `{chi2:.2f}`
    - **P-value**: `{p:.5f}`
    - **Significance**: {'✅ Significant' if p < 0.05 else '❌ Not Significant'}
    """)
    st.subheader("Key Insights")
    st.markdown("""
    - Males tend to report more competitive motivations ("Compete/Win")
    - Female players emphasize "Fun/Relax" and "Improve/Skill" aspects
    - "Improve/Skill" is common across all genders
    - Significant association between gender and gaming motivation (p < 0.05)
    """)

    st.subheader("Interpretation:")
    st.markdown("""
    There is a statistically significant association between gender and gaming motivations. This means:
    - Males and females have different motivational patterns when gaming.
    - The observed differences (e.g., males emphasizing "Compete/Win," females prioritizing "Fun/Relax") are unlikely due to random chance (p < 0.05).
    """)


# ------------------------------------------------------------------------------

# Save a copy of the original dataset for further visualization and correlation (before imputation)
df_original = df.copy()



# KNN imputation (k=2) in three passes. Each pass fills the gaps in the listed
# columns, using only those same columns to find neighbours. The lists keep their
# original names, but note what each pass actually fills:
#   pass 1 fills streams, Narcissism and the SPIN columns (not Hours),
#   pass 2 fills the GAD columns,
#   pass 3 fills Hours and SPIN_T.

# 1️⃣ **PASS 1: streams, Narcissism, SPIN columns**
features_hours = ["streams", "SPIN_T", "SPIN13", "SPIN16", "SPIN12",
                  "Narcissism", "SPIN8", "SPIN10", "SPIN3", "SPIN14"]
imputer = KNNImputer(n_neighbors=2)
df[features_hours] = imputer.fit_transform(df[features_hours])

# Remove the rows whose Hours value is exactly 420 or 8000 (presumably outliers —
# TODO confirm). .copy() so the column assignments below write to an independent
# frame rather than to a filtered slice.
df = df[~df["Hours"].isin([420, 8000])].copy()

# 2️⃣ **PASS 2: GAD columns**
features_narc = ["GAD6", "GAD_T", "GAD5"]
imputer = KNNImputer(n_neighbors=2)
df[features_narc] = imputer.fit_transform(df[features_narc])

# 3️⃣ **PASS 3: Hours, SPIN_T**
features_streams = ["Hours", "SPIN_T"]
imputer = KNNImputer(n_neighbors=2)
df[features_streams] = imputer.fit_transform(df[features_streams])

# 4️⃣ **REMOVE UNNECESSARY COLUMNS**
s_drop = ['Unnamed: 0', 'Zeitstempel', 'Birthplace_ISO3', 'Residence_ISO3', 'highestleague',
          'GAD1', 'GAD2', 'GAD3', 'GAD4', 'GAD5', 'GAD6', 'GAD7',
          'SWL1', 'SWL2', 'SWL3', 'SWL4', 'SWL5',
          'SPIN1', 'SPIN2', 'SPIN3', 'SPIN4', 'SPIN5', 'SPIN6', 'SPIN7', 'SPIN8',
          'SPIN9', 'SPIN10', 'SPIN11', 'SPIN12', 'SPIN13', 'SPIN14', 'SPIN15',
          'SPIN16', 'SPIN17', 'SPIN_T', 'accept']
df = df.drop(columns=s_drop, errors='ignore')


with tab2:

    st.header("Gaming Hours vs Anxiety & Satisfaction with Life")

    # Improved filter presentation within Tab 2:
    st.subheader("Filter Options")
    with st.container():
        col1, col2 = st.columns(2)
        with col1:
            gender_filter = st.selectbox("Select Gender", ["All"] + df["Gender"].dropna().unique().tolist(), key="gender_filter_tab2")
            work_type = st.selectbox("Select Work Type", ["All"] + df["Work"].dropna().unique().tolist(), key="work_type_tab2")
        with col2:
            degree = st.selectbox("Select Degree", ["All"] + df["Degree"].dropna().unique().tolist(), key="degree_tab2")
            residence = st.selectbox("Select Residence", ["All"] + df["Residence"].dropna().unique().tolist(), key="residence_tab2")
            game_type = st.selectbox("Select Game Type", ["All"] + df["Game"].dropna().unique().tolist(), key="game_type_tab2")

    # Apply filters based on the selections
    filtered_df = df.copy()
    if gender_filter != "All":
        filtered_df = filtered_df[filtered_df["Gender"] == gender_filter]
    if work_type != "All":
        filtered_df = filtered_df[filtered_df["Work"] == work_type]
    if degree != "All":
        filtered_df = filtered_df[filtered_df["Degree"] == degree]
    if residence != "All":
        filtered_df = filtered_df[filtered_df["Residence"] == residence]
    if game_type != "All":
        filtered_df = filtered_df[filtered_df["Game"] == game_type]

    # Gaming Hours vs Anxiety (GAD_T)
    if "Hours" in filtered_df.columns and "GAD_T" in filtered_df.columns:
        fig1 = px.scatter(filtered_df, x="Hours", y="GAD_T", trendline="ols",
                          title="Gaming Hours vs Anxiety (GAD_T)",
                          color_discrete_sequence=['#FF6347'])
        fig1.update_traces(line=dict(color='green'))
        st.plotly_chart(fig1)

    # Gaming Hours vs Satisfaction (SWL_T)
    if "Hours" in filtered_df.columns and "SWL_T" in filtered_df.columns:
        fig2 = px.scatter(filtered_df, x="Hours", y="SWL_T", trendline="ols",
                          title="Gaming Hours vs Satisfaction with Life",
                          color_discrete_sequence=['#1E90FF'])
        fig2.update_traces(line=dict(color='red'))
        st.plotly_chart(fig2)



import plotly.express as px
import scipy.stats as stats


with tab3:

    # Full Correlation Matrix (using non-imputed original data)
    st.title("🔍 Full Correlation Matrix ")
    numeric_df_original = df_original.drop(columns=s_drop, errors='ignore').select_dtypes(include=['number'])
    correlation_matrix_original = numeric_df_original.corr()
    st.subheader("📊 Correlation Table ")
    st.dataframe(correlation_matrix_original.style.format("{:.4f}").background_gradient(cmap='coolwarm'))

    # Box plot comparing SWL_T across Work categories
    fig = px.box(df, x='Work', y='SWL_T',
                title="Life Satisfaction (SWL_T) by Employment Status",
                labels={"Work": "Employment Status", "SWL_T": "Life Satisfaction (SWL_T)"})
    st.plotly_chart(fig)

    # Prepare data for ANOVA: group SWL_T values by Work category
    groups = [group['SWL_T'].dropna() for name, group in df.groupby('Work')]

    # Perform one-way ANOVA
    F, p = stats.f_oneway(*groups)

    st.write("### ANOVA Results")
    st.write(f"F-statistic = {F:.2f}, p-value = {p:.2f}")

    st.subheader("Interpretation:")
    st.markdown("""
    Life satisfaction (SWL_T) varies significantly across employment statuses:

    - Large F-value (274.84) indicates strong group differences.

    - Employed individuals (Mean SWL_T = 20.7) report higher life satisfaction than unemployed (14.7).

    - Practical implication: Unemployment may exacerbate mental health challenges in gamers, while employment correlates with better well-being.
    """)

    # Calculate and display mean SWL_T per employment status
    mean_values = df.groupby('Work')['SWL_T'].mean().round(1)
    st.write("### Mean Life Satisfaction (SWL_T) by Employment Status")
    st.write(mean_values)

    # Insight interpretation (example based on provided values)
    st.markdown("""
    **Insight:** Employed individuals reported higher life satisfaction (Mean = 20.7) compared to unemployed individuals (Mean = 14.7).
    The ANOVA indicates that these differences are statistically significant (F = 274.84, p = 0.00).
    """)

# -------------------------------------------------
# Gamers Clustering Based on Habits and Mental Health

with tab4:
    st.title("🔍 Gamers Clustering Based on Habits and Mental Health")

    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA

    cluster_vars = ['Hours', 'GAD_T', 'SWL_T']

    # Rows with all three variables present; .copy() because columns are added below.
    clustering_data = df[cluster_vars].dropna().copy()

    # Standardize so each variable has equal weight in the distance computation.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(clustering_data)

    kmeans = KMeans(n_clusters=2, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)
    # Cluster ids are labels, not quantities. Stored as strings so Plotly treats
    # them as categories and applies the discrete palette; with integer ids it
    # draws a continuous colour bar and ignores color_discrete_sequence.
    clustering_data['Cluster'] = clusters.astype(str)

    # Project the standardized data onto two components for plotting.
    pca = PCA(n_components=2, random_state=42)
    pca_components = pca.fit_transform(scaled_data)
    clustering_data['PC1'] = pca_components[:, 0]
    clustering_data['PC2'] = pca_components[:, 1]

    fig_cluster = px.scatter(clustering_data, x='PC1', y='PC2', color='Cluster',
                            title="Gamers Clusters (PCA-Reduced)",
                            labels={"PC1": "Principal Component 1", "PC2": "Principal Component 2"},
                            color_discrete_sequence=px.colors.qualitative.Bold)
    st.plotly_chart(fig_cluster)

    # Mean of each clustering variable per cluster, rounded to one decimal.
    cluster_profiles = clustering_data.groupby('Cluster')[cluster_vars].mean().round(1)
    st.write("### Cluster Profiles")
    st.dataframe(cluster_profiles)

    st.markdown("""
    Real-World Analogy:
    - Cluster 0 (Balanced Players)

      Gaming Hours: Moderate (18.9 hours)

      Anxiety (GAD_T): Low (2.8)

      Life Satisfaction (SWL_T): High (23.7)

      Profile: Players with healthy gaming habits who maintain good mental health and life satisfaction. Likely play for enjoyment/skill development without excessive time commitment.

    - Cluster 1 (At-Risk Players)

      Gaming Hours: High (26 hours)

      Anxiety (GAD_T): Elevated (9.1)

      Life Satisfaction (SWL_T): Low (13.4)

      Profile: Players showing potential signs of problematic gaming behavior - longer playtimes correlate with higher anxiety and reduced life satisfaction. May be using gaming as an escape mechanism.
    """)

    st.subheader(" Key Psychological Insight:")
    st.markdown("""

    The pattern shows an inverse relationship between gaming hours and mental health metrics:

    ↑ More gaming hours = ↑ Anxiety + ↓ Life satisfaction

    ↓ Moderate gaming = ↓ Anxiety + ↑ Life satisfaction

    This aligns with clinical observations that excessive gaming can be both a symptom and contributor to mental health challenges.

    """)



Writing app.py


In [3]:
from pyngrok import ngrok
ngrok.kill()

In [4]:
import os
from getpass import getpass

# Never commit the ngrok token: read it from the environment, or prompt for it
# when the variable is not set. The token that used to be embedded here is
# exposed and should be revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: "))



In [5]:
!nohup streamlit run app.py &

nohup: appending output to 'nohup.out'


In [6]:
public_url = ngrok.connect(8501)
print(public_url)

NgrokTunnel: "https://6035-34-48-96-142.ngrok-free.app" -> "http://localhost:8501"


#games_sales

In [28]:
%%writefile app.py
import streamlit as st
import pandas as pd
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np


st.set_page_config(page_title="Game Sales Dashboard", page_icon="🎮", layout="wide")


st.markdown("""
<style>
@keyframes fadeIn {
  from {opacity: 0;}
  to {opacity: 1;}
}
.fade-in {
  animation: fadeIn 1s ease-in;
}
</style>
""", unsafe_allow_html=True)

@st.cache_data
def load_data():
    """Load the video-game sales CSV and fill gaps in the year and score columns.

    Returns:
        pandas.DataFrame: the CSV contents with ``User_Score`` coerced to
        numeric, missing ``Year_of_Release`` values replaced by the median
        year, and the four critic/user score and count columns KNN-imputed.
    """
    df = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")
    # Non-numeric user scores become NaN so they can be imputed below.
    df["User_Score"] = pd.to_numeric(df["User_Score"], errors='coerce')
    median_year = int(df["Year_of_Release"].median())
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place form warns on recent pandas and
    # leaves the frame unchanged once Copy-on-Write is enabled.
    df["Year_of_Release"] = df["Year_of_Release"].fillna(median_year)

    numeric_columns = ["Critic_Score", "Critic_Count", "User_Score", "User_Count"]
    imputer = KNNImputer(n_neighbors=5)
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

    return df

df = load_data()

st.sidebar.header("🔍 Filters")
year_min = int(df['Year_of_Release'].min())
year_max = int(df['Year_of_Release'].max())
selected_year_range = st.sidebar.slider(
    "Select Year Range",
    min_value=year_min,
    max_value=year_max,
    # Clamp the preferred 2000-2016 window so the default always lies inside
    # the slider bounds, whatever years the data covers.
    value=(min(max(2000, year_min), year_max), max(min(2016, year_max), year_min))
)

# Drop missing genres so "nan" is not offered as a choice, and keep only the
# preferred defaults that actually exist (Streamlit rejects unknown defaults).
genre_options = df['Genre'].dropna().unique().tolist()
selected_genres = st.sidebar.multiselect(
    "Select Genres",
    options=genre_options,
    default=[genre for genre in ['Action', 'Sports', 'Shooter'] if genre in genre_options]
)


st.title("🎮 Video Game Sales Dashboard")
st.markdown('<div class="fade-in">', unsafe_allow_html=True)

filtered_df = df[
    (df['Year_of_Release'].between(*selected_year_range)) &
    (df['Genre'].isin(selected_genres))
]


# Headline figures for the current filter selection.
col1, col2, col3 = st.columns(3)
with col1:
    st.metric("Total Games Analyzed", len(filtered_df))
with col2:
    # Global_Sales is expressed in millions (the bar chart labels it
    # "Global Sales (Millions)"), so it must not be shown as billions of dollars.
    st.metric("Total Global Sales", f"{filtered_df['Global_Sales'].sum():,.1f}M units")
with col3:
    st.metric("Average Critic Score", f"{filtered_df['Critic_Score'].mean():.1f}/100")


tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "📊 Sales Analysis",
    "🌟 Score Correlations",
    "🌍 Regional Insights",
    "🎮 Game Clusters",
    "📝 References"
])

with tab1:
    st.header("Sales Performance Analysis")

    # Horizontal bar chart of the N best-selling games in the filtered data.
    num_games = st.slider("Select Number of Top Games", 5, 20, 10)
    top_games = filtered_df.nlargest(num_games, 'Global_Sales')

    fig = px.bar(top_games,
                 x='Global_Sales',
                 y='Name',
                 orientation='h',
                 color='Platform',
                 title=f"Top {num_games} Best-Selling Games",
                 labels={'Global_Sales': 'Global Sales (Millions)'})
    # Colouring by platform splits the bars into one trace per platform, which
    # scrambles the ranking; order the axis by sales so the best seller is on top.
    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    st.plotly_chart(fig, use_container_width=True)

    # Total global sales per release year.
    st.subheader("Sales Trends Over Time")
    trend_data = filtered_df.groupby("Year_of_Release")["Global_Sales"].sum().reset_index()
    fig = px.area(trend_data,
                  x='Year_of_Release',
                  y='Global_Sales',
                  markers=True,
                  title="Global Sales Trend Over Time")
    st.plotly_chart(fig, use_container_width=True)

with tab2:
    st.header("Review Score Analysis")

    # The chosen score column becomes the x-axis of the scatter plot.
    score_type = st.radio("Select Score Type", ['Critic_Score', 'User_Score'])
    scatter_title = score_type.replace('_', ' ') + " vs Global Sales"

    fig = px.scatter(
        filtered_df,
        x=score_type, y='Global_Sales',
        color='Genre', size='Critic_Count', hover_name='Name',
        title=scatter_title,
    )
    st.plotly_chart(fig, use_container_width=True)

    # Pairwise correlation of the four regional sales columns.
    st.subheader("Regional Sales Correlations")
    regional_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    corr_matrix = filtered_df[regional_columns].corr()
    fig = px.imshow(
        corr_matrix,
        text_auto=True,
        color_continuous_scale='Blues',
        title="Regional Sales Correlation Heatmap",
    )
    st.plotly_chart(fig, use_container_width=True)

with tab3:
    st.header("Regional Market Analysis")

    # Genre share of sales in the region picked by the user.
    region_choices = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    region = st.selectbox("Select Region", region_choices)
    regional_data = filtered_df.groupby('Genre')[region].sum().reset_index()

    pie_title = region.replace('_', ' ') + " Distribution by Genre"
    fig = px.pie(regional_data, names='Genre', values=region, title=pie_title)
    st.plotly_chart(fig, use_container_width=True)

    # Five publishers with the highest North American sales, shown with their
    # NA, EU and JP totals side by side.
    st.subheader("Top Publishers by Region")
    publishers = filtered_df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum()
    top_na_publishers = publishers.nlargest(5, 'NA_Sales')
    fig = px.bar(top_na_publishers, orientation='h', title="Top Publishers in North America")
    st.plotly_chart(fig, use_container_width=True)

with tab4:
    st.header("Game Clustering Analysis")

    n_clusters = st.slider("Select Number of Clusters", 2, 5, 3)

    # .copy() so the cluster/PCA columns are added to an independent frame
    # rather than to a selection of filtered_df.
    cluster_df = filtered_df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].dropna().copy()

    if len(cluster_df) < n_clusters:
        # KMeans cannot form more clusters than there are rows (e.g. when the
        # sidebar filters match nothing), so explain instead of crashing.
        st.warning("Not enough games match the current filters to build the selected number of clusters.")
    else:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(cluster_df)

        # Fixed seed: Streamlit reruns the script on every interaction, and an
        # unseeded KMeans could relabel the clusters on each rerun.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        # String labels make Plotly colour the clusters as categories instead
        # of drawing a continuous colour bar.
        cluster_df['Cluster'] = kmeans.fit_predict(scaled_data).astype(str)

        # Project the scaled sales figures onto two components for plotting.
        pca = PCA(n_components=2, random_state=42)
        pca_results = pca.fit_transform(scaled_data)
        cluster_df['PC1'] = pca_results[:, 0]
        cluster_df['PC2'] = pca_results[:, 1]

        fig = px.scatter(cluster_df,
                         x='PC1',
                         y='PC2',
                         color='Cluster',
                         hover_data=['NA_Sales', 'EU_Sales'],
                         title="PCA Visualization of Game Clusters")
        st.plotly_chart(fig, use_container_width=True)

        # Mean of every numeric column per cluster.
        st.subheader("Cluster Characteristics")
        profile = cluster_df.groupby('Cluster').mean()
        st.dataframe(profile.style.background_gradient(cmap='Blues'), use_container_width=True)

st.markdown('</div>', unsafe_allow_html=True)


with tab5:

    st.header("""Thank you""")
    st.header("References")
    st.markdown("""
    - The Gaming sales dataset from [Kaggle Dataset](https://www.kaggle.com/datasets/gregorut/videogamesales)

    - The Gaming Psychological dataset from Open Science Framework (https://osf.io/vnbxk/)

    """)




Overwriting app.py


#605

In [32]:
%%writefile app.py
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px
from sklearn.impute import KNNImputer
import nltk
import re
from nltk.stem import WordNetLemmatizer
from scipy.stats import chi2_contingency, f_oneway
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# First run of a session: start on the "Home" page.
if "page" not in st.session_state:
    st.session_state.page = "Home"

# The sidebar button flips between the two pages on each click.
if st.sidebar.button("🔄 NL <-> OJ"):
    if st.session_state.page == "Home":
        st.session_state.page = "Dashboard"
    else:
        st.session_state.page = "Home"


if st.session_state.page == "Home":

# Cache data loading for better performance
    @st.cache_data
    def load_data():
        """Read the gaming-study CSV (latin-1 encoded) into a DataFrame."""
        survey = pd.read_csv("/content/GamingStudy_data.csv", encoding='latin-1')
        return survey

    @st.cache_resource
    def ensure_nltk_corpora():
        """Fetch the WordNet corpora needed by the lemmatizer, once per server process.

        Streamlit reruns this script on every widget interaction; caching the
        call keeps the download check from being repeated on each rerun.
        """
        nltk.download('wordnet', quiet=True)
        nltk.download('omw-1.4', quiet=True)
        return True

    ensure_nltk_corpora()

    # Enhanced categorization with priority system
    categories = {
        'Distraction': ['distract', 'escape', 'forget', 'stress', 'anxiety', 'avoid',
                        'reality', 'pain', 'trouble', 'problem', 'depression', 'nervous'],
        'Habit/Time Pass': ['habit', 'time', 'pass', 'bored', 'routine', 'kill time',
                            'occupied', 'waste', 'fill', 'nothing', 'procrastinate'],
        'Social': ['friend', 'team', 'coop', 'multiplayer', 'social', 'together',
                  'community', 'connect', 'bond', 'relationship', 'with others', 'family'],
        'Compete/Win': ['win', 'compete', 'victory', 'rank', 'ladder', 'gm', 'climb',
                        'top', 'leaderboard', 'dominate', 'triumph', 'beat', 'champion'],
        'Improve/Skill': ['improve', 'learn', 'skill', 'progress', 'master', 'practice',
                          'better', 'develop', 'growth', 'hone', 'enhance', 'advance'],
        'Fun/Relax': ['fun', 'relax', 'enjoy', 'chill', 'unwind', 'distress', 'joy',
                      'pleasure', 'entertain', 'distraction', 'happiness']
    }
    category_priority = ['Distraction', 'Habit/Time Pass', 'Social',
                        'Compete/Win', 'Improve/Skill', 'Fun/Relax']

    def categorize_whyplay(text):
        """Map a free-text "why do you play" answer to motivation categories.

        The answer is lower-cased and stripped of punctuation. A category from
        ``category_priority`` is matched when any of its keywords occurs in the
        answer: single-word keywords match as substrings of a token (raw or
        verb-lemmatized), while multi-word keywords such as "kill time" are
        matched against the whole cleaned answer, because they can never occur
        inside a single token.

        Args:
            text: the raw answer; non-string values are converted with ``str``.

        Returns:
            list[str]: every category when the answer contains the token "all"
            or "every"; otherwise the matched categories in priority order, or
            ``['Other']`` when nothing matched.
        """
        text = re.sub(r'[^\w\s]', '', str(text).lower())
        tokens = text.split()
        # One lemmatizer per call instead of a new instance for every token.
        lemmatizer = WordNetLemmatizer()
        combined_tokens = set(tokens) | {lemmatizer.lemmatize(word, pos='v') for word in tokens}

        # "all"/"every" answers are counted under every motivation. Return a
        # copy so callers cannot mutate the shared priority list.
        if 'all' in combined_tokens or 'every' in combined_tokens:
            return list(category_priority)

        normalized_text = ' '.join(tokens)

        def keyword_hit(keyword):
            # Phrases are looked up in the whole answer, single words per token.
            if ' ' in keyword:
                return keyword in normalized_text
            return any(keyword in token for token in combined_tokens)

        matched = [category for category in category_priority
                   if any(keyword_hit(keyword) for keyword in categories[category])]
        return matched if matched else ['Other']

    # Load and process data
    df = load_data()
    df['whyplay_cats'] = df['whyplay'].apply(categorize_whyplay)
    df_exploded = df.explode('whyplay_cats')

    st.title("Data Story on Gaming 🎮")
    # Streamlit app

    tab0, tab1, tab2, tab3, tab4 = st.tabs([
        "📝 Introduction",
        "🌟 Reason vs Gender",
        "🌍 Addiction and Satisfaction",
        "📊 Math behind Games",
        "🎮 Game Clusters"
    ])

    with tab0:
        st.subheader("A DATA 605 Project by Lalith and Ojas")
        st.subheader("Why Gaming ??!!")
        st.markdown("""
        Did you know that the biggest industry in terms of revenue isn’t movies or music? Surprisingly,
        it’s the video gaming industry, which generated a staggering 💲187 billion in 2023. In comparison,
        the movie industry brought in 💲133 billion, while the music industry trailed far behind at 💲28 billion.
        That means video games earn more than both industries combined.With such immense growth and potential,
        it's clear that the video game industry is a gold mine worth exploring.
        """)

        st.markdown("""

        1. Gaming Industry
    -	2022: $184 billion (Newzoo)
    Driven by mobile, console, and PC gaming, with strong growth in emerging markets.
    -	2023: $187 billion (estimated)
      Moderate growth due to post-pandemic normalization and mobile dominance.
    -	2024: $200 billion (projected)
      Expected growth from cloud gaming, esports, and next-gen hardware.
    ________________________________________
    2. Music Industry
    -	2022: $26.2 billion (IFPI)
      Streaming accounted for 67% of recorded music revenue.
    -	2023: $28–29 billion (estimated)
      Continued growth in streaming and live events post-pandemic.
    -	2024: $31–33 billion (projected)
      Expansion in emerging markets and vinyl/cassette nostalgia trends.
    ________________________________________
    3. Movie Industry
    -	2022:	Theatrical: \$26 billion (Global Box Office)
      Total (including streaming): $90 billion (MPA estimates).
    -	2023:	Theatrical: \$33 billion (Box Office Mojo)
      Total: $100 billion (streaming platforms like Netflix and Disney+).
    -	2024: $105–110 billion (projected)
      Hybrid releases (theatrical + streaming) and international markets driving growth.
    ________________________________________
    4. Sports Industry
    -	2022: $487 billion (Statista)
      Includes media rights, sponsorships, merchandise, and live events.
    -	2023: $500–520 billion (estimated)
      Recovery in live attendance and rising media deals (e.g., NFL, Premier League).
    -	2024: $550–600 billion (projected)
      Growth in digital streaming rights and global events (e.g., Olympics, FIFA World Cup)

        """)

        st.subheader("Info about dataset")

        st.markdown("""
        GAD     [0 --> 3]   [not at all,  several days,  over half the days, nearly always]

    - If u feel nervous, anxious
    - not being able to control or stop worrying
    - worrying too much about other things
    - trouble relaxing
    - being restless
    - becoming easily annoyd or irritated
    - feeling awfull as somthin bad is goin to happen
    """)

        st.markdown("<span style='color: red; font-weight: bold;'>More is Bad!</span>", unsafe_allow_html=True)

        st.markdown("""

    SWL    [1 --> 7] [Strongly disagree, Disagree, silghtly disagree, neutral, silghtly agre, agree, strongly disagree]

    - satisfied
    - ideal
    - excellent life
    - wont not change my life
    - gotten important things in I want in my life
    """)

        st.markdown("<span style='color: green; font-weight: bold;'>More is Good!</span>", unsafe_allow_html=True)


    with tab1:
        st.header("Behavioral Analysis")

        # Create filter buttons in columns

        filter_col1, filter_col2, filter_col3 = st.columns(3)

        # Gender Filter
        with filter_col1:
            gender_options = ["All"] + df['Gender'].dropna().unique().tolist()
            selected_gender = st.radio(
                "Gender",
                gender_options,
                index=0,
                key="gender_filter"
            )

        # Apply filters
        filtered_exploded = df_exploded.copy()

        if selected_gender != "All":
            filtered_exploded = filtered_exploded[filtered_exploded['Gender'] == selected_gender]

        # Count players per motivation. The columns are named explicitly
        # because value_counts().reset_index() produces different column names
        # depending on the pandas version, and the chart relies on
        # 'whyplay_cats' and 'count'.
        motivation_counts = (
            filtered_exploded['whyplay_cats']
            .value_counts()
            .rename_axis('whyplay_cats')
            .reset_index(name='count')
        )

        # Create visualization with filtered data
        fig = px.bar(motivation_counts,
                    x='count', y='whyplay_cats', orientation='h',
                    title="Gaming Motivations Breakdown",
                    labels={'whyplay_cats': 'Motivation Category', 'count': 'Player Count'},
                    color='whyplay_cats',
                    color_discrete_sequence=px.colors.qualitative.Pastel)

        # Dynamic subtitle with active filters
        filter_text = []
        if selected_gender != "All": filter_text.append(f"Gender: {selected_gender}")


        if filter_text:
            st.caption(f"Active filters: {', '.join(filter_text)}")

        fig.update_layout(
            height=500,
            width=800,
            yaxis_title="Motivation Category",
            xaxis_title="Number of Players",
            legend_title="Motivation",
            showlegend=False  # Cleaner look for horizontal bars
        )

        st.plotly_chart(fig)
        # Chi-square test for association between Gender and whyplay_cats
        st.header("Statistical Analysis")
        contingency_table = pd.crosstab(df_exploded['Gender'], df_exploded['whyplay_cats'])
        chi2, p, _, _ = chi2_contingency(contingency_table)

        st.subheader("Chi-Square Test Results")
        st.markdown(f"""
        - **Chi-Square Statistic**: `{chi2:.2f}`
        - **P-value**: `{p:.5f}`
        - **Significance**: {'✅ Significant' if p < 0.05 else '❌ Not Significant'}
        """)
        st.subheader("Key Insights")
        st.markdown("""
        - Males tend to report more competitive motivations ("Compete/Win")
        - Female players emphasize "Fun/Relax" and "Improve/Skill" aspects
        - "Improve/Skill" is common across all genders
        - Significant association between gender and gaming motivation (p < 0.05)
        """)

        st.subheader("Interpretation:")
        st.markdown("""
        There is a statistically significant association between gender and gaming motivations. This means:
        - Males and females have different motivational patterns when gaming.
        - The observed differences (e.g., males emphasizing "Compete/Win," females prioritizing "Fun/Relax") are unlikely due to random chance (p < 0.05).
        """)


    # ------------------------------------------------------------------------------

    # Save a copy of the original dataset for further visualization and correlation (before imputation)
    df_original = df.copy()



    # 1️⃣ **IMPUTE MISSING VALUES (streams, Narcissism and SPIN columns)**
    # Gaps in these columns are KNN-imputed from one another. "Hours" is not
    # part of this step; it is imputed in step 3.
    features_hours = ["streams", "SPIN_T", "SPIN13", "SPIN16", "SPIN12",
                      "Narcissism", "SPIN8", "SPIN10", "SPIN3", "SPIN14"]
    imputer = KNNImputer(n_neighbors=2)
    df[features_hours] = imputer.fit_transform(df[features_hours])
    # Drop the rows whose Hours value is 420 or 8000 (presumably data-entry
    # outliers; confirm against the raw data). .copy() keeps the assignments
    # below from writing into a filtered selection.
    df = df[~df["Hours"].isin([420, 8000])].copy()

    # 2️⃣ **IMPUTE MISSING VALUES (GAD columns)**
    features_narc = ["GAD6", "GAD_T", "GAD5"]
    imputer = KNNImputer(n_neighbors=2)
    df[features_narc] = imputer.fit_transform(df[features_narc])

    # 3️⃣ **IMPUTE MISSING VALUES (Hours)**
    # Hours is imputed jointly with SPIN_T, which step 1 already filled.
    features_streams = ["Hours", "SPIN_T"]
    imputer = KNNImputer(n_neighbors=2)
    df[features_streams] = imputer.fit_transform(df[features_streams])

    # 4️⃣ **REMOVE UNNECESSARY COLUMNS**
    s_drop = ['Unnamed: 0', 'Zeitstempel', 'Birthplace_ISO3', 'Residence_ISO3', 'highestleague',
              'GAD1', 'GAD2', 'GAD3', 'GAD4', 'GAD5', 'GAD6', 'GAD7',
              'SWL1', 'SWL2', 'SWL3', 'SWL4', 'SWL5',
              'SPIN1', 'SPIN2', 'SPIN3', 'SPIN4', 'SPIN5', 'SPIN6', 'SPIN7', 'SPIN8',
              'SPIN9', 'SPIN10', 'SPIN11', 'SPIN12', 'SPIN13', 'SPIN14', 'SPIN15',
              'SPIN16', 'SPIN17', 'SPIN_T', 'accept']
    df = df.drop(columns=s_drop, errors='ignore')


    with tab2:

        st.header("Gaming Hours vs Anxiety & Satisfaction with Life")

        with st.container():
            col1, col2 = st.columns(2)
            with col1:
                gender_filter = st.selectbox("Select Gender", ["All"] + df["Gender"].dropna().unique().tolist(), key="gender_filter_tab2")
                work_type = st.selectbox("Select Work Type", ["All"] + df["Work"].dropna().unique().tolist(), key="work_type_tab2")
            with col2:
                degree = st.selectbox("Select Degree", ["All"] + df["Degree"].dropna().unique().tolist(), key="degree_tab2")
                residence = st.selectbox("Select Residence", ["All"] + df["Residence"].dropna().unique().tolist(), key="residence_tab2")
                game_type = st.selectbox("Select Game Type", ["All"] + df["Game"].dropna().unique().tolist(), key="game_type_tab2")

        # Restrict the data to the choices made above; "All" leaves a column unfiltered.
        filtered_df = df.copy()
        selections = (
            ("Gender", gender_filter),
            ("Work", work_type),
            ("Degree", degree),
            ("Residence", residence),
            ("Game", game_type),
        )
        for column, choice in selections:
            if choice != "All":
                filtered_df = filtered_df[filtered_df[column] == choice]

        available_columns = set(filtered_df.columns)

        # Hours against the anxiety total, with an OLS trend line drawn in green.
        if {"Hours", "GAD_T"} <= available_columns:
            fig1 = px.scatter(
                filtered_df, x="Hours", y="GAD_T", trendline="ols",
                title="Gaming Hours vs Anxiety (GAD_T)",
                color_discrete_sequence=['#FF6347'],
            )
            fig1.update_traces(line=dict(color='green'))
            st.plotly_chart(fig1)

        # Hours against the life-satisfaction total, with an OLS trend line drawn in red.
        if {"Hours", "SWL_T"} <= available_columns:
            fig2 = px.scatter(
                filtered_df, x="Hours", y="SWL_T", trendline="ols",
                title="Gaming Hours vs Satisfaction with Life",
                color_discrete_sequence=['#1E90FF'],
            )
            fig2.update_traces(line=dict(color='red'))
            st.plotly_chart(fig2)



    import plotly.express as px
    import scipy.stats as stats


    with tab3:

        # Full Correlation Matrix (using non-imputed original data)

        numeric_df_original = df_original.drop(columns=s_drop, errors='ignore').select_dtypes(include=['number'])
        correlation_matrix_original = numeric_df_original.corr()
        st.subheader("📊 Correlation Table ")
        st.dataframe(correlation_matrix_original.style.format("{:.4f}").background_gradient(cmap='coolwarm'))

        # Box plot comparing SWL_T across Work categories
        fig = px.box(df, x='Work', y='SWL_T',
                    title="Life Satisfaction (SWL_T) by Employment Status",
                    labels={"Work": "Employment Status", "SWL_T": "Life Satisfaction (SWL_T)"})
        st.plotly_chart(fig)

        # Prepare data for ANOVA: SWL_T values per Work category. Categories
        # without any non-missing score are skipped, because an empty group
        # makes the test undefined.
        groups = [
            scores
            for scores in (group['SWL_T'].dropna() for _, group in df.groupby('Work'))
            if len(scores) > 0
        ]

        st.write("### ANOVA Results")
        if len(groups) < 2:
            # A one-way ANOVA needs at least two groups to compare.
            st.write("Not enough employment groups with data to run an ANOVA.")
        else:
            # Perform one-way ANOVA
            F, p = stats.f_oneway(*groups)
            # .3g keeps very small p-values visible instead of rounding them to 0.00.
            st.write(f"F-statistic = {F:.2f}, p-value = {p:.3g}")

        st.subheader("Interpretation:")
        st.markdown("""
        Life satisfaction (SWL_T) varies significantly across employment statuses:

        - Large F-value (274.84) indicates strong group differences.

        - Employed individuals (Mean SWL_T = 20.7) report higher life satisfaction than unemployed (14.7).

        - Practical implication: Unemployment may exacerbate mental health challenges in gamers, while employment correlates with better well-being.
        """)

        # Calculate and display mean SWL_T per employment status
        mean_values = df.groupby('Work')['SWL_T'].mean().round(1)
        st.write("### Mean Life Satisfaction (SWL_T) by Employment Status")
        st.write(mean_values)

        # Insight interpretation (example based on provided values)
        st.markdown("""
        **Insight:** Employed individuals reported higher life satisfaction (Mean = 20.7) compared to unemployed individuals (Mean = 14.7).
        The ANOVA indicates that these differences are statistically significant (F = 274.84, p = 0.00).
        """)

    # -------------------------------------------------
    # Gamers Clustering Based on Habits and Mental Health

    with tab4:
        st.title("🔍 Gamers Clustering Based on Habits and Mental Health")

        from sklearn.preprocessing import StandardScaler
        from sklearn.cluster import KMeans
        from sklearn.decomposition import PCA

        cluster_vars = ['Hours', 'GAD_T', 'SWL_T']

        # .copy() so the label and PCA columns are added to an independent frame.
        clustering_data = df[cluster_vars].dropna().copy()

        # Standardize so the three variables contribute on a comparable scale.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(clustering_data)

        kmeans = KMeans(n_clusters=2, random_state=42)
        clusters = kmeans.fit_predict(scaled_data)
        # Store the labels as strings: with numeric labels Plotly draws a
        # continuous colour bar and ignores color_discrete_sequence.
        clustering_data['Cluster'] = clusters.astype(str)

        # Two principal components of the scaled data, used only for plotting.
        pca = PCA(n_components=2, random_state=42)
        pca_components = pca.fit_transform(scaled_data)
        clustering_data['PC1'] = pca_components[:, 0]
        clustering_data['PC2'] = pca_components[:, 1]

        fig_cluster = px.scatter(clustering_data, x='PC1', y='PC2', color='Cluster',
                                title="Gamers Clusters (PCA-Reduced)",
                                labels={"PC1": "Principal Component 1", "PC2": "Principal Component 2"},
                                color_discrete_sequence=px.colors.qualitative.Bold)
        st.plotly_chart(fig_cluster)

        # Per-cluster means of the original (unscaled) variables.
        cluster_profiles = clustering_data.groupby('Cluster')[cluster_vars].mean().round(1)
        st.write("### Cluster Profiles")
        st.dataframe(cluster_profiles)

        st.markdown("""
        Real-World Analogy:
        - Cluster 0 (Balanced Players)

          Gaming Hours: Moderate (18.9 hours)

          Anxiety (GAD_T): Low (2.8)

          Life Satisfaction (SWL_T): High (23.7)

          Profile: Players with healthy gaming habits who maintain good mental health and life satisfaction. Likely play for enjoyment/skill development without excessive time commitment.

        - Cluster 1 (At-Risk Players)

          Gaming Hours: High (26 hours)

          Anxiety (GAD_T): Elevated (9.1)

          Life Satisfaction (SWL_T): Low (13.4)

          Profile: Players showing potential signs of problematic gaming behavior - longer playtimes correlate with higher anxiety and reduced life satisfaction. May be using gaming as an escape mechanism.
        """)

        st.subheader(" Key Psychological Insight:")
        st.markdown("""

        The pattern shows an inverse relationship between gaming hours and mental health metrics:

        ↑ More gaming hours = ↑ Anxiety + ↓ Life satisfaction

        ↓ Moderate gaming = ↓ Anxiety + ↑ Life satisfaction

        This aligns with clinical observations that excessive gaming can be both a symptom and contributor to mental health challenges.

        """)

        #----------------------oj---------------------
else:



    st.markdown("""
    <style>
    @keyframes fadeIn {
      from {opacity: 0;}
      to {opacity: 1;}
    }
    .fade-in {
      animation: fadeIn 1s ease-in;
    }
    </style>
    """, unsafe_allow_html=True)

    @st.cache_data
    def load_data():
        """Load the video-game sales CSV and fill gaps in the year and score columns.

        Returns:
            pandas.DataFrame: the CSV contents with ``User_Score`` coerced to
            numeric, missing ``Year_of_Release`` values replaced by the median
            year, and the four critic/user score and count columns KNN-imputed.
        """
        df = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")
        # Non-numeric user scores become NaN so they can be imputed below.
        df["User_Score"] = pd.to_numeric(df["User_Score"], errors='coerce')
        median_year = int(df["Year_of_Release"].median())
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: that chained in-place form warns on recent pandas
        # and leaves the frame unchanged once Copy-on-Write is enabled.
        df["Year_of_Release"] = df["Year_of_Release"].fillna(median_year)

        numeric_columns = ["Critic_Score", "Critic_Count", "User_Score", "User_Count"]
        imputer = KNNImputer(n_neighbors=5)
        df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

        return df

    df = load_data()

    year_min = int(df['Year_of_Release'].min())
    year_max = int(df['Year_of_Release'].max())
    selected_year_range = st.sidebar.slider(
        "Select Year Range",
        min_value=year_min,
        max_value=year_max,
        # Clamp the preferred 2000-2016 window so the default always lies
        # inside the slider bounds, whatever years the data covers.
        value=(min(max(2000, year_min), year_max), max(min(2016, year_max), year_min))
    )

    # Drop missing genres so "nan" is not offered as a choice, and keep only the
    # preferred defaults that actually exist (Streamlit rejects unknown defaults).
    genre_options = df['Genre'].dropna().unique().tolist()
    selected_genres = st.sidebar.multiselect(
        "Select Genres",
        options=genre_options,
        default=[genre for genre in ['Action', 'Sports', 'Shooter'] if genre in genre_options]
    )

    st.title("Data Story on Gaming 🎮")
    st.header(" 💲 Video Game Sales")
    st.markdown('<div class="fade-in">', unsafe_allow_html=True)

    filtered_df = df[
        (df['Year_of_Release'].between(*selected_year_range)) &
        (df['Genre'].isin(selected_genres))
    ]


    # Headline figures for the current filter selection.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Games Analyzed", len(filtered_df))
    with col2:
        # Global_Sales is expressed in millions (the bar chart labels it
        # "Global Sales (Millions)"), so it must not be shown as billions of dollars.
        st.metric("Total Global Sales", f"{filtered_df['Global_Sales'].sum():,.1f}M units")
    with col3:
        st.metric("Average Critic Score", f"{filtered_df['Critic_Score'].mean():.1f}/100")


    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "📊 Sales Analysis",
        "🌟 Score Correlations",
        "🌍 Regional Insights",
        "🎮 Game Clusters",
        "📝 References"
    ])

    with tab1:
        st.header("Sales Performance Analysis")

        # Horizontal bar chart of the N best-selling games in the filtered data.
        num_games = st.slider("Select Number of Top Games", 5, 20, 10)
        top_games = filtered_df.nlargest(num_games, 'Global_Sales')

        fig = px.bar(top_games,
                    x='Global_Sales',
                    y='Name',
                    orientation='h',
                    color='Platform',
                    title=f"Top {num_games} Best-Selling Games",
                    labels={'Global_Sales': 'Global Sales (Millions)'})
        # Colouring by platform splits the bars into one trace per platform,
        # which scrambles the ranking; order the axis by sales so the best
        # seller is on top.
        fig.update_layout(yaxis={'categoryorder': 'total ascending'})
        st.plotly_chart(fig, use_container_width=True)

        # Total global sales per release year.
        st.subheader("Sales Trends Over Time")
        trend_data = filtered_df.groupby("Year_of_Release")["Global_Sales"].sum().reset_index()
        fig = px.area(trend_data,
                      x='Year_of_Release',
                      y='Global_Sales',
                      markers=True,
                      title="Global Sales Trend Over Time")
        st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.header("Review Score Analysis")

        # The chosen score column becomes the x-axis of the scatter plot.
        score_type = st.radio("Select Score Type", ['Critic_Score', 'User_Score'])
        scatter_title = score_type.replace('_', ' ') + " vs Global Sales"

        fig = px.scatter(
            filtered_df,
            x=score_type, y='Global_Sales',
            color='Genre', size='Critic_Count', hover_name='Name',
            title=scatter_title,
        )
        st.plotly_chart(fig, use_container_width=True)

        # Pairwise correlation of the four regional sales columns.
        st.subheader("Regional Sales Correlations")
        regional_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
        corr_matrix = filtered_df[regional_columns].corr()
        fig = px.imshow(
            corr_matrix,
            text_auto=True,
            color_continuous_scale='Blues',
            title="Regional Sales Correlation Heatmap",
        )
        st.plotly_chart(fig, use_container_width=True)

    with tab3:
        st.header("Regional Market Analysis")

        # Genre share of sales in the region picked by the user.
        region_choices = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
        region = st.selectbox("Select Region", region_choices)
        regional_data = filtered_df.groupby('Genre')[region].sum().reset_index()

        pie_title = region.replace('_', ' ') + " Distribution by Genre"
        fig = px.pie(regional_data, names='Genre', values=region, title=pie_title)
        st.plotly_chart(fig, use_container_width=True)

        # Five publishers with the highest North American sales, shown with
        # their NA, EU and JP totals side by side.
        st.subheader("Top Publishers by Region")
        publishers = filtered_df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum()
        top_na_publishers = publishers.nlargest(5, 'NA_Sales')
        fig = px.bar(top_na_publishers, orientation='h', title="Top Publishers in North America")
        st.plotly_chart(fig, use_container_width=True)

    with tab4:
        st.header("Game Clustering Analysis")

        n_clusters = st.slider("Select Number of Clusters", 2, 5, 3)

        # .copy() so the cluster/PCA columns are added to an independent frame
        # rather than to a selection of filtered_df.
        cluster_df = filtered_df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].dropna().copy()

        if len(cluster_df) < n_clusters:
            # KMeans cannot form more clusters than there are rows (e.g. when
            # the sidebar filters match nothing), so explain instead of crashing.
            st.warning("Not enough games match the current filters to build the selected number of clusters.")
        else:
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(cluster_df)

            # Fixed seed: Streamlit reruns the script on every interaction, and
            # an unseeded KMeans could relabel the clusters on each rerun.
            kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
            # String labels make Plotly colour the clusters as categories
            # instead of drawing a continuous colour bar.
            cluster_df['Cluster'] = kmeans.fit_predict(scaled_data).astype(str)

            # Project the scaled sales figures onto two components for plotting.
            pca = PCA(n_components=2, random_state=42)
            pca_results = pca.fit_transform(scaled_data)
            cluster_df['PC1'] = pca_results[:, 0]
            cluster_df['PC2'] = pca_results[:, 1]

            fig = px.scatter(cluster_df,
                            x='PC1',
                            y='PC2',
                            color='Cluster',
                            hover_data=['NA_Sales', 'EU_Sales'],
                            title="PCA Visualization of Game Clusters")
            st.plotly_chart(fig, use_container_width=True)

            # Mean of every numeric column per cluster.
            st.subheader("Cluster Characteristics")
            profile = cluster_df.groupby('Cluster').mean()
            st.dataframe(profile.style.background_gradient(cmap='Blues'), use_container_width=True)

    st.markdown('</div>', unsafe_allow_html=True)


    with tab5:

        st.header("""Thank you""")
        st.header("References")
        st.markdown("""
        - The Gaming sales dataset from [Kaggle Dataset](https://www.kaggle.com/datasets/gregorut/videogamesales)

        - The Gaming Psychological dataset from Open Science Framework (https://osf.io/vnbxk/)

        """)






Overwriting app.py


In [None]:
from pyngrok import ngrok
ngrok.kill()

In [None]:
import os
from getpass import getpass

# Never commit the ngrok token: read it from the environment, or prompt for it
# when the variable is not set. The token that used to be embedded here is
# exposed and should be revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: "))



In [None]:
!nohup streamlit run app.py &

nohup: appending output to 'nohup.out'


In [None]:
public_url = ngrok.connect(8501)
print(public_url)

NgrokTunnel: "https://cb74-104-198-1-110.ngrok-free.app" -> "http://localhost:8501"
