# Reddit and Discord Data Merge
## Purpose: merge the Reddit and Discord datasets into a single unified dataset.

In [3]:
import pandas as pd

In [5]:
# Read the raw Reddit and Discord exports from the working directory
reddit_data = pd.read_csv("reddit_raw_data.csv")
discord_data = pd.read_csv("discord_messages.csv")

# Show the first rows of each frame under its own banner
print("Reddit Dataset:", reddit_data.head(), sep="\n")
print("\nDiscord Dataset:", discord_data.head(), sep="\n")

# List the column names present in each frame
print("\nReddit Columns:", list(reddit_data.columns))
print("Discord Columns:", list(discord_data.columns))

Reddit Dataset:
                                               Title     Content  Upvotes  \
0  Cisco Confirms Authenticity of Data After Seco...  No Content        2   
1  Mining old data from NASA’s Voyager 2 solves s...  No Content       14   
2  Israel built an ‘AI factory’ for war. It unlea...  No Content       33   
3  Developer fires entire team for AI, now ends u...  No Content      629   
4  FDA Approves First Generic of Once-Daily GLP-1...  No Content      157   

   Comments_Count                                       Top_Comments  \
0               1  ['Damn ATT, Verizon, Cisco, social security, a...   
1               3  ['« NASA’s Voyager 2 flyby of Uranus decades a...   
2               6  ['AI in war raises concerns.', 'When it comes ...   
3              68  ['Creativity and problem-solving abilities, de...   
4               9  ['This is generic victoza, which has been out ...   

               Created              Author  \
0  2024-12-30 23:15:22          lurker_bee

# Discord Data: Issue Handling

In [7]:
# Tally the Python type of every value stored in the raw 'author' column
author_value_types = discord_data['author'].apply(type)
author_column_type = author_value_types.value_counts()

# Report the tally
print("Data types in the 'Author' column:", author_column_type, sep="\n")

Data types in the 'Author' column:
author
<class 'str'>    13320
Name: count, dtype: int64


In [8]:
import ast 

# Function to extract relevant fields from the 'Author' column
def extract_author_info(author_data):
    """Parse one stringified author payload into a flat dict of author fields.

    Parameters
    ----------
    author_data : str
        String holding a Python-literal dict (parsed with ``ast.literal_eval``).
        Any other value is treated as unparseable.

    Returns
    -------
    dict
        Always contains the keys ``Author_ID``, ``Author_Username``,
        ``Author_Discriminator``, ``Author_Global_Name`` and
        ``Author_Primary_Guild`` (in that order). A key missing from the
        payload yields ``'Unknown'``; a key that is present keeps its value
        as-is (including ``None``). If the payload cannot be parsed, or does
        not parse to a dict, every field is ``'Unknown'``.
    """
    # Output column name -> key looked up in the parsed payload.
    field_map = {
        'Author_ID': 'id',
        'Author_Username': 'username',
        'Author_Discriminator': 'discriminator',
        'Author_Global_Name': 'global_name',
        'Author_Primary_Guild': 'primary_guild',
    }

    try:
        # literal_eval only evaluates literals, so no code in the string is executed.
        author_dict = ast.literal_eval(author_data)
    except (ValueError, SyntaxError, TypeError, RecursionError):
        # literal_eval may raise any of these on malformed input
        # (e.g. TypeError for an unhashable dict key).
        author_dict = None

    # A payload that parses to a list/str/number has no `.get`; treat it as
    # unparseable instead of letting AttributeError abort the whole `.apply`.
    if not isinstance(author_dict, dict):
        return dict.fromkeys(field_map, 'Unknown')

    return {
        column: author_dict.get(payload_key, 'Unknown')
        for column, payload_key in field_map.items()
    }

# Parse every author payload, then fan the resulting dicts out into columns
parsed_authors = discord_data['author'].apply(extract_author_info)
author_info = parsed_authors.apply(pd.Series)

# Swap the raw 'author' column for the extracted author columns
discord_without_author = discord_data.drop(columns=['author'])
discord_data = pd.concat([discord_without_author, author_info], axis=1)

# Show the first rows of the reshaped Discord frame
print(discord_data.head())

                                             content  \
0  *The mother of learning, is repetition.*\n*The...   
1                                                🤨 ?   
2  Only two things can do that. <a:FP_StarSparkle...   
3  Fair enough. Will this take me from "zero to h...   
4                           It's your time to waste.   

                          timestamp           Author_ID Author_Username  \
0  2024-12-30T17:32:53.427000+00:00  244847651013656577        adiablue   
1  2024-12-30T17:32:10.251000+00:00  345205383373258753      wwolverine   
2  2024-12-30T17:31:35.458000+00:00  244847651013656577        adiablue   
3  2024-12-30T17:30:50.323000+00:00  345205383373258753      wwolverine   
4  2024-12-30T17:30:03.453000+00:00  244847651013656577        adiablue   

  Author_Discriminator Author_Global_Name Author_Primary_Guild  
0                    0           The Blue                 None  
1                    0          Wolverine                 None  
2                

## Data Cleaning

In [10]:
# Author_Global_Name is the only author field kept from here on. Before the
# username column is discarded, use it to fill any missing display name so
# those messages do not end up with an empty author.
# NOTE(review): assumes accounts without a display name carry a null
# global_name in the export — confirm against the raw data.
discord_data = discord_data.assign(
    Author_Global_Name=discord_data['Author_Global_Name'].fillna(discord_data['Author_Username'])
)

# Drop the author columns that are no longer needed
discord_data = discord_data.drop(columns=['Author_ID', 'Author_Username', 'Author_Discriminator', 'Author_Primary_Guild'])

# Preview the updated DataFrame
print(discord_data.head())

                                             content  \
0  *The mother of learning, is repetition.*\n*The...   
1                                                🤨 ?   
2  Only two things can do that. <a:FP_StarSparkle...   
3  Fair enough. Will this take me from "zero to h...   
4                           It's your time to waste.   

                          timestamp Author_Global_Name  
0  2024-12-30T17:32:53.427000+00:00           The Blue  
1  2024-12-30T17:32:10.251000+00:00          Wolverine  
2  2024-12-30T17:31:35.458000+00:00           The Blue  
3  2024-12-30T17:30:50.323000+00:00          Wolverine  
4  2024-12-30T17:30:03.453000+00:00           The Blue  


# Reddit Data Handling

In [14]:
# The Reddit frame no longer needs its 'URL' column
reddit_data = reddit_data.drop(labels=['URL'], axis="columns")

# Show the first rows after the drop
print(reddit_data.head())

                                               Title     Content  Upvotes  \
0  Cisco Confirms Authenticity of Data After Seco...  No Content        2   
1  Mining old data from NASA’s Voyager 2 solves s...  No Content       14   
2  Israel built an ‘AI factory’ for war. It unlea...  No Content       33   
3  Developer fires entire team for AI, now ends u...  No Content      629   
4  FDA Approves First Generic of Once-Daily GLP-1...  No Content      157   

   Comments_Count                                       Top_Comments  \
0               1  ['Damn ATT, Verizon, Cisco, social security, a...   
1               3  ['« NASA’s Voyager 2 flyby of Uranus decades a...   
2               6  ['AI in war raises concerns.', 'When it comes ...   
3              68  ['Creativity and problem-solving abilities, de...   
4               9  ['This is generic victoza, which has been out ...   

               Created              Author  
0  2024-12-30 23:15:22          lurker_bee  
1  2024-12-30 

In [16]:
# Append a 'Title_Content' column (title, one space, content) and
# discard the standalone 'Content' column in the same step
reddit_data = reddit_data.assign(
    Title_Content=reddit_data['Title'] + " " + reddit_data['Content']
).drop(columns=['Content'])

# Show the first rows of the reshaped frame
print(reddit_data.head())

                                               Title  Upvotes  Comments_Count  \
0  Cisco Confirms Authenticity of Data After Seco...        2               1   
1  Mining old data from NASA’s Voyager 2 solves s...       14               3   
2  Israel built an ‘AI factory’ for war. It unlea...       33               6   
3  Developer fires entire team for AI, now ends u...      629              68   
4  FDA Approves First Generic of Once-Daily GLP-1...      157               9   

                                        Top_Comments              Created  \
0  ['Damn ATT, Verizon, Cisco, social security, a...  2024-12-30 23:15:22   
1  ['« NASA’s Voyager 2 flyby of Uranus decades a...  2024-12-30 21:23:56   
2  ['AI in war raises concerns.', 'When it comes ...  2024-12-30 20:23:18   
3  ['Creativity and problem-solving abilities, de...  2024-12-30 19:54:32   
4  ['This is generic victoza, which has been out ...  2024-12-30 18:41:36   

               Author                             

In [18]:
# Count the nulls in each column
print("Missing values per column:", reddit_data.isna().sum(), sep="\n")

# Dump dtypes / non-null counts; info() writes its report itself and
# returns None, which is printed as well
info_result = reddit_data.info()
print(info_result)

# Show the first rows to eyeball inconsistencies
print(reddit_data.head())

Missing values per column:
Title             0
Upvotes           0
Comments_Count    0
Top_Comments      0
Created           0
Author            0
Title_Content     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           649 non-null    object
 1   Upvotes         649 non-null    int64 
 2   Comments_Count  649 non-null    int64 
 3   Top_Comments    649 non-null    object
 4   Created         649 non-null    object
 5   Author          649 non-null    object
 6   Title_Content   649 non-null    object
dtypes: int64(2), object(5)
memory usage: 35.6+ KB
None
                                               Title  Upvotes  Comments_Count  \
0  Cisco Confirms Authenticity of Data After Seco...        2               1   
1  Mining old data from NASA’s Voyager 2 solves s...       14               3   
2  Israel built an ‘A

In [20]:
# Keep only the first occurrence of each fully identical row
reddit_data = reddit_data.drop_duplicates()

# Report how many rows are left
print(f"Number of rows after removing duplicates: {len(reddit_data)}")


Number of rows after removing duplicates: 649


In [22]:
# Count the nulls in each column
print("Missing values per column after deduplication:", reddit_data.isna().sum(), sep="\n")

# Rows lacking any of the essential fields are discarded
essential_columns = ['Title', 'Created', 'Upvotes', 'Comments_Count']
reddit_data = reddit_data.dropna(subset=essential_columns)

# A missing Top_Comments entry becomes the text of an empty list
reddit_data = reddit_data.assign(Top_Comments=reddit_data['Top_Comments'].fillna("[]"))


Missing values per column after deduplication:
Title             0
Upvotes           0
Comments_Count    0
Top_Comments      0
Created           0
Author            0
Title_Content     0
dtype: int64


In [24]:
# Engagement_Rate = (upvotes + comment count) / (number of rows + 1)
interaction_total = reddit_data['Upvotes'].add(reddit_data['Comments_Count'])
reddit_data['Engagement_Rate'] = interaction_total.div(len(reddit_data) + 1)

In [None]:
from datetime import datetime

# Mean upvotes and mean comment count across all posts
avg_upvotes = reddit_data['Upvotes'].mean()
avg_comments = reddit_data['Comments_Count'].mean()
print(f"Average Upvotes: {avg_upvotes}, Average Comments: {avg_comments}")

# Engagement_Ratio = (upvotes + comment count) / number of rows
total_posts = len(reddit_data)
reddit_data['Engagement_Ratio'] = (
    reddit_data['Upvotes'].add(reddit_data['Comments_Count']).div(total_posts)
)
print(reddit_data[['Title', 'Engagement_Ratio']].head())

# Comments per upvote; the 1e-9 offset keeps the denominator non-zero
upvotes_offset = reddit_data['Upvotes'].add(1e-9)
reddit_data['Comment_to_Upvote_Ratio'] = reddit_data['Comments_Count'].div(upvotes_offset)
print(reddit_data[['Title', 'Comment_to_Upvote_Ratio']].head())

# Parse 'Created' into datetimes, then measure whole days elapsed up to
# the moment this cell runs
reddit_data['Created'] = pd.to_datetime(reddit_data['Created'])
post_age = datetime.now() - reddit_data['Created']
reddit_data['Post_Lifetime_Days'] = post_age.dt.days
print(reddit_data[['Title', 'Post_Lifetime_Days']].head())


In [26]:
# Trim leading/trailing whitespace from every title
reddit_data = reddit_data.assign(Title=reddit_data['Title'].str.strip())

# Show the first few Top_Comments entries
print("Sample Top Comments:", reddit_data['Top_Comments'].head(), sep="\n")


Sample Top Comments:
0    ['Damn ATT, Verizon, Cisco, social security, a...
1    ['« NASA’s Voyager 2 flyby of Uranus decades a...
2    ['AI in war raises concerns.', 'When it comes ...
3    ['Creativity and problem-solving abilities, de...
4    ['This is generic victoza, which has been out ...
Name: Top_Comments, dtype: object


In [28]:
# Write the cleaned Reddit frame to disk without the index column
reddit_data.to_csv(path_or_buf="cleaned_reddit_data.csv", index=False)
print("Cleaned dataset saved as 'cleaned_reddit_data.csv'")

Cleaned dataset saved as 'cleaned_reddit_data.csv'


In [30]:
# Tag every row with the platform it came from
reddit_data = reddit_data.assign(Source='Reddit')
discord_data = discord_data.assign(Source='Discord')

In [32]:
# Map each source's column names onto the shared schema
# (Text / Timestamp / Author / Topic)
reddit_data_renamed = reddit_data.rename(
    columns=dict(Title='Topic', Created='Timestamp', Top_Comments='Text')
)

discord_data_renamed = discord_data.rename(
    columns=dict(content='Text', timestamp='Timestamp', Author_Global_Name='Author')
)

In [34]:
# Discord rows have no topic, so give them a constant placeholder
discord_data_renamed = discord_data_renamed.assign(Topic="Unknown")

# Both sources are reduced to the same five columns, in the same order
reddit_columns_to_keep = ['Text', 'Timestamp', 'Author', 'Source', 'Topic']
discord_columns_to_keep = list(reddit_columns_to_keep)

# Project each frame onto the shared columns
reddit_data_final = reddit_data_renamed.loc[:, reddit_columns_to_keep]
discord_data_final = discord_data_renamed.loc[:, discord_columns_to_keep]

# Stack Reddit rows above Discord rows under a fresh 0..n-1 index
merged_data = pd.concat(
    [reddit_data_final, discord_data_final],
    axis=0,
    ignore_index=True,
)

# Show the first rows of the combined frame
print(merged_data.head())

                                                Text            Timestamp  \
0  ['Damn ATT, Verizon, Cisco, social security, a...  2024-12-30 23:15:22   
1  ['« NASA’s Voyager 2 flyby of Uranus decades a...  2024-12-30 21:23:56   
2  ['AI in war raises concerns.', 'When it comes ...  2024-12-30 20:23:18   
3  ['Creativity and problem-solving abilities, de...  2024-12-30 19:54:32   
4  ['This is generic victoza, which has been out ...  2024-12-30 18:41:36   

               Author  Source  \
0          lurker_bee  Reddit   
1              fchung  Reddit   
2         MetaKnowing  Reddit   
3  No-Information6622  Reddit   
4          Peter55667  Reddit   

                                               Topic  
0  Cisco Confirms Authenticity of Data After Seco...  
1  Mining old data from NASA’s Voyager 2 solves s...  
2  Israel built an ‘AI factory’ for war. It unlea...  
3  Developer fires entire team for AI, now ends u...  
4  FDA Approves First Generic of Once-Daily GLP-1...  


In [36]:
# Write the combined frame to disk without the index column
merged_data.to_csv(path_or_buf="merged_reddit_discord_data.csv", index=False)
print("Merged dataset saved to 'merged_reddit_discord_data.csv'")

Merged dataset saved to 'merged_reddit_discord_data.csv'
