# FORUM 8

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

# Function to get a database connection
def get_db_connection():
    """Open and return a new SQLAlchemy connection to the local ``june`` database.

    The engine (and therefore its connection pool) is created on first use and
    cached on the function object, so repeated calls check connections out of
    one pool instead of building a fresh engine every time.

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    # NOTE(review): the credentials are hard-coded in the URL below; consider
    # loading them from configuration instead.
    engine = getattr(get_db_connection, '_engine', None)
    if engine is None:
        engine = create_engine('postgresql://postgres:1234@localhost:5432/june')
        get_db_connection._engine = engine
    return engine.connect()

# Function to get the total number of threads in the forum
def get_total_threads_in_forum(forum_id, classification_threshold):
    """Count the distinct topics of a forum whose classification score is high enough.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only topics whose ``classification2_topic``
            is strictly greater than this value are counted.

    Returns:
        The ``COUNT(DISTINCT topic_id)`` value reported by the database.
    """
    query = """
    SELECT COUNT(DISTINCT topic_id) AS total_threads
    FROM topics
    WHERE forum_id = %s AND classification2_topic > %s
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return df['total_threads'].iloc[0]

# Function to get the total number of unique users in the forum
def get_total_unique_users_in_forum(forum_id, classification_threshold):
    """Count the distinct users who posted in the qualifying topics of a forum.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only posts in topics whose
            ``classification2_topic`` is strictly greater than this value
            are considered.

    Returns:
        The ``COUNT(DISTINCT posts.user_id)`` value reported by the database.
    """
    query = """
    SELECT COUNT(DISTINCT posts.user_id) AS total_users
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.topic_id
    WHERE topics.forum_id = %s AND topics.classification2_topic > %s
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return df['total_users'].iloc[0]

# Function to get the number of posts by every unique user in the forum
def get_posts_by_unique_users(forum_id, classification_threshold):
    """Return the number of posts each user wrote in the qualifying topics of a forum.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only posts in topics whose
            ``classification2_topic`` is strictly greater than this value
            are counted.

    Returns:
        A DataFrame with one row per user and the columns ``user_id`` and
        ``post_count``.
    """
    query = """
    SELECT posts.user_id, COUNT(posts.post_id) AS post_count
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.topic_id
    WHERE topics.forum_id = %s AND topics.classification2_topic > %s
    GROUP BY posts.user_id
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return df

# Function to get the number of posts per thread
def get_posts_per_thread(forum_id, classification_threshold):
    """Return the number of posts in each qualifying topic of a forum.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only topics whose ``classification2_topic``
            is strictly greater than this value are included.

    Returns:
        A DataFrame with one row per topic and the columns ``topic_id`` and
        ``post_count``. Because of the inner join, topics without any post
        do not appear.
    """
    query = """
    SELECT posts.topic_id, COUNT(posts.post_id) AS post_count
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.topic_id
    WHERE topics.forum_id = %s AND topics.classification2_topic > %s
    GROUP BY posts.topic_id
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return df

# Function to get the distribution of threads over time
def get_threads_over_time(forum_id, classification_threshold):
    """Return the number of qualifying topics created per calendar month.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only topics whose ``classification2_topic``
            is strictly greater than this value are counted.

    Returns:
        A DataFrame ordered by month with the columns ``month``
        (``dateadded_topic`` truncated to the month) and ``thread_count``.
    """
    query = """
    SELECT DATE_TRUNC('month', dateadded_topic) AS month, COUNT(topic_id) AS thread_count
    FROM topics
    WHERE forum_id = %s AND classification2_topic > %s
    GROUP BY month
    ORDER BY month
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return df

# Forum under analysis and the classification score a topic must exceed
forum_id, classification_threshold = 8, 0.5


def _show_top_bar_chart(ids, counts, title, xlabel):
    """Draw and show one bar chart of post counts keyed by stringified ids."""
    plt.figure(figsize=(10, 6))
    plt.bar(ids.astype(str), counts)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Posts')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Headline counts: threads first, then unique users
total_threads = get_total_threads_in_forum(forum_id, classification_threshold)
print(f"Total number of threads in forum {forum_id}: {total_threads}")

total_users = get_total_unique_users_in_forum(forum_id, classification_threshold)
print(f"Total number of unique users in forum {forum_id}: {total_users}")

# Posts written by each user (only a preview is printed)
user_post_counts = get_posts_by_unique_users(forum_id, classification_threshold)
print("Number of posts by each unique user:")
print(user_post_counts.head())

# Posts contained in each thread (only a preview is printed)
thread_post_counts = get_posts_per_thread(forum_id, classification_threshold)
print("Number of posts per thread:")
print(thread_post_counts.head())

# Summary statistics of the posts-per-thread distribution
posts_per_thread = thread_post_counts['post_count']
avg_posts_per_thread = posts_per_thread.mean()
median_posts_per_thread = posts_per_thread.median()
std_posts_per_thread = posts_per_thread.std()

print(f"Average number of posts per thread: {avg_posts_per_thread}")
print(f"Median number of posts per thread: {median_posts_per_thread}")
print(f"Standard deviation of posts per thread: {std_posts_per_thread}")

# Monthly thread counts; rows containing missing values are discarded
threads_over_time = get_threads_over_time(forum_id, classification_threshold).dropna()
print("Distribution of threads over time:")
print(threads_over_time.head())

# Line chart of threads created per month
plt.figure(figsize=(10, 6))
plt.plot(threads_over_time['month'], threads_over_time['thread_count'], marker='o')
plt.title(f'Distribution of Threads Over Time in Forum {forum_id}')
plt.xlabel('Month')
plt.ylabel('Number of Threads')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Ten users with the most posts
top_users = user_post_counts.sort_values(by='post_count', ascending=False).head(10)
_show_top_bar_chart(
    top_users['user_id'],
    top_users['post_count'],
    f'Top 10 Most Active Users in Forum {forum_id}',
    'User ID',
)

# Ten threads with the most posts
top_threads = thread_post_counts.sort_values(by='post_count', ascending=False).head(10)
_show_top_bar_chart(
    top_threads['topic_id'],
    top_threads['post_count'],
    f'Top 10 Most Active Threads in Forum {forum_id}',
    'Thread ID',
)

# Persist both count tables next to the notebook
user_post_counts.to_csv(f'forum_{forum_id}_user_post_counts.csv', index=False)
thread_post_counts.to_csv(f'forum_{forum_id}_thread_post_counts.csv', index=False)


# Alpha_Beta_graph

In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sqlalchemy import create_engine
import json

# Function to get a database connection
def get_db_connection():
    """Open and return a new SQLAlchemy connection to the local ``june`` database.

    The engine (and therefore its connection pool) is created on first use and
    cached on the function object, so the many calls made by the parameter
    sweep share one pool instead of building a fresh engine every time.

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    # NOTE(review): the credentials are hard-coded in the URL below; consider
    # loading them from configuration instead.
    engine = getattr(get_db_connection, '_engine', None)
    if engine is None:
        engine = create_engine('postgresql://postgres:1234@localhost:5432/june')
        get_db_connection._engine = engine
    return engine.connect()

# Function to get early adopters from the database
def get_early_adopters_from_db(forum_id, alpha, beta, classification_threshold,
                               content_length_threshold=0, min_posts=1, min_threads=1):
    """Collect the earliest unique commenters of every qualifying topic in a forum.

    Posts are loaded for topics of ``forum_id`` whose ``classification2_topic``
    exceeds ``classification_threshold`` and are then filtered in this order:

    1. posts whose content length is not greater than
       ``content_length_threshold`` are dropped;
    2. users with fewer than ``min_posts`` remaining posts are dropped;
    3. users active in fewer than ``min_threads`` distinct remaining topics
       are dropped;
    4. each user is kept once per topic, represented by their earliest
       remaining post in that topic.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        alpha: Number of earliest commenters kept in ``csc`` / ``tcsc``.
        beta: Number of earliest commenters kept in ``ncsc`` / ``tncsc``.
        classification_threshold: Lower bound (exclusive) on
            ``classification2_topic``.
        content_length_threshold: Lower bound (exclusive) on post length.
            Defaults to 0, the least restrictive value used by the parameter
            sweep in this notebook.
        min_posts: Minimum posts per user. Defaults to 1 (no effect).
        min_threads: Minimum distinct topics per user. Defaults to 1
            (no effect).

    Returns:
        A tuple ``(csc, ncsc, tcsc, tncsc)`` of dicts keyed by topic id:
        user ids of the first ``alpha`` / ``beta`` commenters in chronological
        order, and the matching post timestamps (UTC).
    """
    query = """
    SELECT DISTINCT posts.topic_id, posts.user_id, posts.dateadded_post, LENGTH(posts.content_post) AS content_length, topics.classification2_topic
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.topic_id
    WHERE topics.forum_id = %s AND topics.classification2_topic > %s
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))

    # Convert dateadded_post to datetime and ensure UTC
    df['dateadded_post'] = pd.to_datetime(df['dateadded_post'], utc=True)

    # Keep only posts longer than the content-length threshold
    df = df[df['content_length'] > content_length_threshold]

    # Keep only users with at least min_posts remaining posts
    posts_per_user = df.groupby('user_id').size()
    df = df[df['user_id'].isin(posts_per_user[posts_per_user >= min_posts].index)]

    # Keep only users who appear in at least min_threads distinct topics
    threads_per_user = df.groupby('user_id')['topic_id'].nunique()
    df = df[df['user_id'].isin(threads_per_user[threads_per_user >= min_threads].index)]

    # Sort chronologically BEFORE de-duplicating: the query has no ORDER BY, so
    # without this drop_duplicates would keep an arbitrary post per user and
    # topic instead of the earliest one.
    df = df.sort_values(by='dateadded_post', kind='mergesort')
    df = df.drop_duplicates(subset=['topic_id', 'user_id'])

    alpha = int(alpha)
    beta = int(beta)
    csc, ncsc, tcsc, tncsc = {}, {}, {}, {}

    # groupby keeps the row order inside each group, so every group is
    # already in chronological order here.
    for topic_id, group in df.groupby('topic_id'):
        commenters = group['user_id'].tolist()
        post_times = group['dateadded_post'].tolist()
        csc[topic_id] = commenters[:alpha]
        ncsc[topic_id] = commenters[:beta]
        tcsc[topic_id] = post_times[:alpha]
        tncsc[topic_id] = post_times[:beta]

    return csc, ncsc, tcsc, tncsc

# Function to get the total number of threads in the forum
def get_total_threads_in_forum(forum_id, classification_threshold):
    """Count the distinct topics of a forum whose classification score is high enough.

    The count is computed by the database instead of downloading every
    matching ``topic_id`` and de-duplicating it in pandas.

    Args:
        forum_id: Value matched against ``topics.forum_id``.
        classification_threshold: Only topics whose ``classification2_topic``
            is strictly greater than this value are counted.

    Returns:
        The number of distinct topic ids as a plain ``int``.
    """
    query = """
    SELECT COUNT(DISTINCT topic_id) AS total_threads
    FROM topics
    WHERE forum_id = %s AND classification2_topic > %s
    """
    # The context manager closes the connection even when the query raises.
    with get_db_connection() as conn:
        df = pd.read_sql(query, conn, params=(forum_id, classification_threshold))
    return int(df['total_threads'].iloc[0])

# Define parameters
forum_id = 8
classification_threshold = 0.5
alpha_values = [10, 20, 30, 40, 50]
beta_multipliers = [2, 3, 4, 5, 10]
content_length_thresholds = [0, 2, 10]
min_posts_values = [1, 5, 10]
min_threads_values = [1, 2, 5]

# Largest beta of the whole grid. The database query and the post/user filters
# do not depend on alpha or beta (those only truncate the per-topic lists), so
# one load with this cut-off serves every (alpha, beta) pair of a filter
# combination.
max_beta = max(alpha_values) * max(beta_multipliers)


def _load_post_times(content_length_threshold, min_posts, min_threads):
    """Return, per topic, the post times of its first ``max_beta`` unique commenters."""
    _, _, _, post_times = get_early_adopters_from_db(
        forum_id, max_beta, max_beta, classification_threshold,
        content_length_threshold, min_posts, min_threads,
    )
    return post_times


# Get the total number of threads in the forum
total_threads = get_total_threads_in_forum(forum_id, classification_threshold)
print(f"Total number of threads in forum {forum_id}: {total_threads}")

# Initialize a list to store results
results = []

# Extract data for Forum 8
for content_length_threshold in content_length_thresholds:
    for min_posts in min_posts_values:
        for min_threads in min_threads_values:
            # One database round trip per filter combination instead of one
            # per (alpha, beta) pair.
            post_times = _load_post_times(content_length_threshold, min_posts, min_threads)

            for alpha in alpha_values:
                for beta_multiplier in beta_multipliers:
                    beta = alpha * beta_multiplier

                    # A topic is "viral" when it has at least alpha and at
                    # least beta unique commenters.
                    viral_topics = 0
                    times_to_alpha = []
                    times_to_beta = []
                    for times in post_times.values():
                        if len(times) >= alpha and len(times) >= beta:
                            viral_topics += 1
                            time_to_alpha = (times[alpha - 1] - times[0]).total_seconds() / (3600 * 24)
                            time_to_beta = (times[beta - 1] - times[0]).total_seconds() / (3600 * 24)
                            times_to_alpha.append(time_to_alpha)
                            times_to_beta.append(time_to_beta)

                    non_viral_topics = total_threads - viral_topics
                    mean_time_to_alpha = np.mean(times_to_alpha) if times_to_alpha else 0
                    mean_time_to_beta = np.mean(times_to_beta) if times_to_beta else 0

                    results.append({
                        'Content Length Threshold': content_length_threshold,
                        'Min Posts': min_posts,
                        'Min Threads': min_threads,
                        'Alpha': alpha,
                        'Beta': beta,
                        'Viral Topics': viral_topics,
                        'Non-Viral Topics': non_viral_topics,
                        'Mean Time to Reach Alpha (days)': f"{mean_time_to_alpha:.2f}",
                        'Mean Time to Reach Beta (days)': f"{mean_time_to_beta:.2f}"
                    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nViral and Non-Viral Topics Analysis:")
print(results_df)

# Plotting the data for each combination of alpha and beta values.
# Consecutive rows share their filter combination, so remembering only the
# most recently loaded combination avoids re-querying without keeping every
# result set in memory.
loaded_filters = None
post_times = {}
for i, row in results_df.iterrows():
    content_length_threshold = row['Content Length Threshold']
    min_posts = row['Min Posts']
    min_threads = row['Min Threads']
    alpha = int(row['Alpha'])
    beta = int(row['Beta'])

    current_filters = (content_length_threshold, min_posts, min_threads)
    if current_filters != loaded_filters:
        post_times = _load_post_times(content_length_threshold, min_posts, min_threads)
        loaded_filters = current_filters

    # Visualize data for Forum 8
    topic = []
    start = []
    end_alpha = []
    end_beta = []

    for key, times in post_times.items():
        if len(times) >= alpha and len(times) >= beta:
            topic.append(key)
            start.append(times[0])
            end_alpha.append(times[alpha - 1])
            end_beta.append(times[beta - 1])

    if topic:
        # A single reference "now" per figure keeps all bars comparable.
        now = pd.Timestamp.now(tz='UTC')
        recent = [now] * len(topic)

        topic_np = np.array(topic)
        start_np = np.array(start)
        end_alpha_np = np.array(end_alpha)
        end_beta_np = np.array(end_beta)
        recent_np = np.array(recent)

        # Order every series by the time of the topic's first post
        order = np.argsort(start_np)
        start_sort = start_np[order]
        end_alpha_sort = end_alpha_np[order]
        end_beta_sort = end_beta_np[order]
        recent_sort = recent_np[order]
        topic_sort = topic_np[order]

        # Durations in whole days, expressed in 30-day months
        time_to_alpha_months = (end_alpha_sort - start_sort).astype('timedelta64[D]').astype(int) / 30
        time_to_beta_months = (end_beta_sort - end_alpha_sort).astype('timedelta64[D]').astype(int) / 30
        time_to_recent_months = (recent_sort - end_beta_sort).astype('timedelta64[D]').astype(int) / 30

        fig = go.Figure()

        # Add bars for the time to reach alpha, the additional time to reach
        # beta, and the time elapsed since beta was reached
        fig.add_trace(go.Bar(
            y=topic_sort,
            x=time_to_alpha_months,
            base=0,
            orientation='h',
            marker=dict(color='grey'),
            name='Time to Reach Alpha'
        ))

        fig.add_trace(go.Bar(
            y=topic_sort,
            x=time_to_beta_months,
            base=time_to_alpha_months,
            orientation='h',
            marker=dict(color='green'),
            name='Time to Reach Beta'
        ))

        fig.add_trace(go.Bar(
            y=topic_sort,
            x=time_to_recent_months,
            base=(time_to_alpha_months + time_to_beta_months),
            orientation='h',
            marker=dict(color='red'),
            name='Time to Most Recent Post'
        ))

        fig.update_layout(
            title=f'Forum {forum_id} with alpha {alpha} and beta {beta} (Content Length > {content_length_threshold}, Min Posts >= {min_posts}, Min Threads >= {min_threads})',
            xaxis_title='Months',
            yaxis_title='Topic ID',
            barmode='stack'
        )

        # Save each figure as a JSON file
        fig_json = fig.to_json()
        with open(f'forum_{forum_id}_alpha_{alpha}_beta_{beta}_content_{content_length_threshold}_posts_{min_posts}_threads_{min_threads}.json', 'w') as f:
            f.write(fig_json)

        print(f"Saved figure for forum {forum_id} with alpha {alpha}, beta {beta}, content length {content_length_threshold}, min posts {min_posts}, min threads {min_threads} as JSON.")

# Save results to CSV
results_df.to_csv('viral_non_viral_analysis_with_filters.csv', index=False)


Total number of threads in forum 8: 2959

Viral and Non-Viral Topics Analysis:
     Content Length Threshold  Min Posts  Min Threads  Alpha  Beta  \
0                           0          1            1     10    20   
1                           0          1            1     10    30   
2                           0          1            1     10    40   
3                           0          1            1     10    50   
4                           0          1            1     10   100   
..                        ...        ...          ...    ...   ...   
670                        10         10            5     50   100   
671                        10         10            5     50   150   
672                        10         10            5     50   200   
673                        10         10            5     50   250   
674                        10         10            5     50   500   

     Viral Topics  Non-Viral Topics Mean Time to Reach Alpha (days)  \
0        

In [9]:
import plotly.io as pio
import os
import glob

def load_and_display_json_files(directory):
    """Show every Plotly figure stored as a ``*.json`` file directly inside *directory*.

    Files are processed in sorted path order so the figures appear in a
    reproducible sequence (``glob`` itself returns paths in arbitrary order).
    Nothing is shown or printed when the directory contains no JSON file.

    Args:
        directory: Path of the directory to scan (not searched recursively).
    """
    json_files = sorted(glob.glob(os.path.join(directory, '*.json')))

    for json_file in json_files:
        with open(json_file, 'r') as file:
            fig_json = file.read()
        # The file is already closed here; rendering does not need it open.
        fig = pio.from_json(fig_json)
        fig.show()
        print(f"Displayed figure from {json_file}")

# Directory where JSON files are saved
# NOTE(review): the value below is a placeholder and must be replaced with a
# real path before running. The cell that writes the figure JSON files opens
# them with bare relative file names, so they presumably end up in the
# notebook's working directory — confirm.
json_directory = 'path_to_your_json_directory'

# Load and display all JSON files in the directory
load_and_display_json_files(json_directory)


# Saving Network

In [5]:
import networkx as nx
import pickle as pk
from sqlalchemy import create_engine

def create_and_save_network(forum_id, path):
    """Build a weighted directed co-participation graph for a forum and pickle it.

    For every topic, each entry of the topic's ``user_id`` list is linked to
    every later entry in that list (in the order the rows are delivered);
    an edge's ``weight`` counts how often that ordered pair was seen. Pairs
    of identical user ids are skipped, so no self-loops are created.

    NOTE(review): ``extract_forum_data`` is not defined in the visible part of
    this notebook; it is expected to return a DataFrame with ``topic_id`` and
    ``user_id`` columns — confirm where it comes from and how its rows are
    ordered.

    Args:
        forum_id: Forwarded to ``extract_forum_data``.
        path: Destination of the pickled graph.
    """
    forum_posts = extract_forum_data(forum_id)
    graph = nx.DiGraph()

    for _, topic_rows in forum_posts.groupby('topic_id'):
        participants = topic_rows['user_id'].tolist()
        graph.add_nodes_from(participants)
        for position, source in enumerate(participants):
            for target in participants[position + 1:]:
                if source == target:
                    continue
                if graph.has_edge(source, target):
                    graph[source][target]['weight'] += 1
                else:
                    graph.add_edge(source, target, weight=1)

    with open(path, 'wb') as out_file:
        pk.dump(graph, out_file)
    print(f'Network for forum {forum_id} saved to {path}')

# Build the interaction network of forum 8 and store it as a pickle file
# relative to the current working directory.
forum_id = 8
network_path = 'forum_8_network.pkl'
create_and_save_network(forum_id, network_path)


Network for forum 8 saved to forum_8_network.pkl


# Feature extraction

In [7]:

import community
from joblib import Parallel, delayed
import multiprocessing


def get_net(path):
    """Load and return the object pickled at *path*.

    NOTE: unpickling executes arbitrary code, so only files produced by this
    notebook (or another trusted source) should be passed in.
    """
    with open(path, 'rb') as handle:
        return pk.load(handle)

# Feature extraction functions
def get_f1(users):
    """Return the mean number of neighbors the given users have in the global graph ``X``.

    NOTE(review): if ``X`` is a directed graph, ``neighbors`` yields
    successors only, which would make this identical to ``get_f3`` — confirm
    that is intended.
    """
    neighbor_counts = []
    for member in users:
        neighbor_counts.append(sum(1 for _ in X.neighbors(member)))
    return np.mean(neighbor_counts)

def get_f2(root):
    """Return how many neighbors *root* has in the global graph ``X``."""
    return sum(1 for _ in X.neighbors(root))

def get_f3(users):
    """Return the mean (unweighted) out-degree of the given users in the global graph ``X``."""
    out_degrees = []
    for member in users:
        out_degrees.append(X.out_degree(member))
    return np.mean(out_degrees)

def get_f4(pst_tm):
    """Return the minutes between the first and last entry of *pst_tm*, rounded to 2 decimals.

    Args:
        pst_tm: Non-empty sequence of datetime-like values whose difference
            provides ``total_seconds()``.
    """
    first, last = pst_tm[0], pst_tm[-1]
    seconds = (last - first).total_seconds()
    return round(seconds / 60, 2)

def get_f5(root):
    """Return the degree centrality of *root* in the global graph ``X`` (0 if absent).

    The value is computed for this one node only. ``nx.degree_centrality``
    builds the centrality of every node, which is wasteful when it is called
    once per topic just to read a single entry. The formula mirrors the
    NetworkX definition: degree divided by ``n - 1``, and 1 for every node of
    a graph with at most one node.
    """
    if root not in X:
        return 0
    node_count = len(X)
    if node_count <= 1:
        return 1
    scale = 1.0 / (node_count - 1.0)
    return X.degree(root) * scale

def get_f6(root):
    """Return the out-degree centrality of *root* in the global graph ``X`` (0 if absent).

    The value is computed for this one node only. ``nx.out_degree_centrality``
    builds the centrality of every node, which is wasteful when it is called
    once per topic just to read a single entry. The formula mirrors the
    NetworkX definition: out-degree divided by ``n - 1``, and 1 for every
    node of a graph with at most one node. ``X`` must be a directed graph.
    """
    if root not in X:
        return 0
    node_count = len(X)
    if node_count <= 1:
        return 1
    scale = 1.0 / (node_count - 1.0)
    return X.out_degree(root) * scale

def get_f7(root):
    """Return the entry of the global centrality mapping ``C`` for *root*, or 0 if it has none."""
    try:
        return C[root]
    except KeyError:
        return 0

def get_f8(root):
    """Return the out-degree of *root* in the global graph ``X``, weighted by the ``weight`` edge attribute."""
    weighted_out_degree = X.out_degree(weight='weight')
    return weighted_out_degree[root]

def get_f9(users):
    """Return the mean ``weight``-weighted out-degree of the given users in the global graph ``X``."""
    weighted_out_degree = X.out_degree(weight='weight')
    strengths = []
    for member in users:
        strengths.append(weighted_out_degree[member])
    return np.mean(strengths)

def get_f12(users):
    """Return the group out-degree centrality of *users* in the global graph ``X``."""
    group_score = nx.group_out_degree_centrality(X, users)
    return group_score

def get_f14(users):
    """Return the group closeness centrality of *users* in the global graph ``X``, using ``weight`` as edge distance."""
    group_score = nx.group_closeness_centrality(X, users, weight='weight')
    return group_score

def get_f15(times):
    """Return the mean gap, in minutes, between consecutive entries of *times*.

    The consecutive differences telescope, so their sum equals
    ``times[-1] - times[0]``. Computing it that way avoids the loop and the
    ``dt.timedelta()`` start value, which relied on a ``dt`` module alias
    that this notebook never imports.

    Args:
        times: Sequence of datetime-like values whose difference provides
            ``total_seconds()``.

    Returns:
        The mean gap rounded to 2 decimals, or 0 when fewer than two entries
        are given.
    """
    gap_count = len(times) - 1
    if gap_count < 1:
        return 0
    elapsed = times[-1] - times[0]
    return round(elapsed.total_seconds() / (60 * gap_count), 2)

def get_communities(users):
    """Partition the subgraph of the global graph ``UX`` induced by *users*.

    Returns:
        A tuple ``(number of communities, modularity)`` of the partition found
        by ``community.best_partition`` with a fixed random state, or
        ``(0, 0.0)`` when the induced subgraph has no edges.
    """
    induced = UX.subgraph(users)
    if not induced.number_of_edges():
        return 0, 0.0

    partition = community.best_partition(induced, weight='weight', random_state=40)
    community_count = len(set(partition.values()))
    modularity = community.modularity(partition, induced, weight='weight')
    return community_count, modularity

def extract_feature_set(key, users, post_times, label):
    """Assemble the feature row of one topic.

    The first entry of *users* is treated as the root user for the
    single-node features.

    Args:
        key: Topic identifier (not used by the computation).
        users: User ids of the topic's commenters.
        post_times: Timestamps matching *users*.
        label: Class label appended as the last element of the row.

    Returns:
        A list in the order F1, F2, F3, F4, F5, F6, F7, F8, F9, F12, F14, F15,
        F16, F17, label.
    """
    row = [round(get_f1(users), 2)]

    root = users[0]
    row.append(round(get_f2(root), 2))
    row.append(round(get_f3(users), 2))
    row.append(get_f4(post_times))
    for root_feature in (get_f5, get_f6, get_f7, get_f8):
        row.append(round(root_feature(root), 2))
    for group_feature in (get_f9, get_f12, get_f14):
        row.append(round(group_feature(users), 2))
    row.append(get_f15(post_times))

    community_count, modularity = get_communities(users)
    row += [community_count, modularity, label]
    return row

def extract_features(csc, ncsc, tcsc, tncsc, network):
    """Compute the feature table for all topics in *csc* (class 1) and *ncsc* (class 0).

    Publishes the module-level globals read by the ``get_f*`` helpers:
    ``X`` (the graph), ``C`` (eigenvector centrality of the reversed graph),
    ``UX`` (undirected copy) and ``PR`` (PageRank; not read by any function in
    the visible part of this notebook).

    Args:
        csc: Mapping topic id -> user ids, labelled 1.
        ncsc: Mapping topic id -> user ids, labelled 0.
        tcsc: Mapping topic id -> timestamps matching *csc*.
        tncsc: Mapping topic id -> timestamps matching *ncsc*.
        network: Directed graph the features are computed on.

    Returns:
        A DataFrame with one row per entry of *csc* followed by one row per
        entry of *ncsc*.
    """
    global X, C, UX, PR
    X = network
    C = nx.eigenvector_centrality_numpy(X.reverse(), weight='weight')
    UX = X.to_undirected()
    PR = nx.pagerank(X, alpha=0.9, weight='weight')

    worker_count = multiprocessing.cpu_count()

    positive_jobs = (
        delayed(extract_feature_set)(topic, members, tcsc[topic], 1)
        for topic, members in csc.items()
    )
    pos_features = Parallel(n_jobs=worker_count)(positive_jobs)

    negative_jobs = (
        delayed(extract_feature_set)(topic, members, tncsc[topic], 0)
        for topic, members in ncsc.items()
    )
    neg_features = Parallel(n_jobs=worker_count)(negative_jobs)

    columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F12', 'F14', 'F15', 'F16', 'F17', 'Class']
    return pd.DataFrame(pos_features + neg_features, columns=columns)

# Main function
def main():
    """Extract the features for forum 8 and write them to ``Forum8_features.csv``.

    Loads the early adopters (alpha=10, beta=20), loads the pickled network
    (building it first when the file does not exist), computes the feature
    table and saves it as CSV.

    NOTE(review): ``get_early_adopters_from_db`` as defined in this notebook
    puts every topic into both ``csc`` and ``ncsc``, so each topic contributes
    one row of class 1 and one row of class 0 — confirm that is intended.

    Returns:
        The features DataFrame.
    """
    alpha = 10
    beta = 20
    classification_threshold = 0.5
    forum_id = 8  # Specify your forum ID

    # The loader defined in this notebook also requires the three filter
    # arguments; the values below are the least restrictive ones used by the
    # parameter sweep.
    csc, ncsc, tcsc, tncsc = get_early_adopters_from_db(
        forum_id, alpha, beta, classification_threshold,
        content_length_threshold=0, min_posts=1, min_threads=1,
    )

    network_path = f'forum_{forum_id}_network.pkl'
    try:
        network = get_net(network_path)
    except FileNotFoundError:
        # No cached network yet: build it once, then load it.
        create_and_save_network(forum_id, network_path)
        network = get_net(network_path)

    feature_df = extract_features(csc, ncsc, tcsc, tncsc, network)
    feature_df.to_csv(f'Forum{forum_id}_features.csv', index=False)

    return feature_df

# Run the feature-extraction pipeline when the cell executes and keep the
# resulting features DataFrame for the following cells
feature_df = main()


KeyboardInterrupt: 

# Model Training and Validation

# Model Evaluation on all the other forums

# Cross Validation

# Visualizations(Forum 8)

## Degree Centrality

## Betweenness Centrality

## Community Detection

# Network Visualization (alpha = 30, Beta = [60, 90, 120])