In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/content/preprocessed_data_csv.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/preprocessed_data_csv.csv'

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.size

In [None]:
df.shape

In [None]:
cancer_type_counts = df['cancer_type'].value_counts()
print(cancer_type_counts)

In [None]:
# Bar chart of the number of rows per cancer type (from cancer_type_counts).
fig, ax = plt.subplots(figsize=(20, 6))
cancer_type_counts.plot.bar(ax=ax, color='skyblue')
ax.set_title('Count of Cancer Types')
ax.set_xlabel('Cancer Type')
ax.set_ylabel('Count')
plt.xticks(rotation=85)
plt.show()

In [None]:
# Word cloud of content
from wordcloud import WordCloud

# Concatenate every post (cast to str first so non-string cells cannot break
# the join) and render the word cloud of the combined text.
text = df['content'].astype(str).str.cat(sep=' ')
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(text)

fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Word Cloud of Content')
ax.axis('off')
plt.show()

In [None]:
# Report how many rows have no 'content', then drop them.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignments below cannot trigger pandas' SettingWithCopyWarning.
print(df['content'].isnull().sum())
df = df.dropna(subset=['content']).copy()

# Show the dtype pandas inferred for 'content', then normalise it to str.
print(df['content'].dtype)
df['content'] = df['content'].astype(str)

# Length of each post in whitespace-separated words (vectorised equivalent of
# len(x.split()) per row).
df['content_length'] = df['content'].str.split().str.len()

# Distribution of content length
plt.figure(figsize=(10, 6))
sns.histplot(df['content_length'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Content Length')
plt.xlabel('Content Length of the posts')
plt.ylabel('Frequency of the posts')
plt.show()

In [None]:
# Date-wise distribution of posts: count rows per calendar day and plot the
# counts in chronological order.
df['date'] = pd.to_datetime(df['date'])
posts_per_day = df['date'].dt.date.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
posts_per_day.plot(kind='line', color='skyblue', ax=ax)
ax.set_title('Date-wise Distribution of Posts')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

#By plotting the date-wise distribution of posts as a line graph, one can observe trends, spikes,
#or patterns in posting activity over the period covered by the dataset. This information can be useful
#for understanding user engagement, identifying popular posting times, or detecting anomalies in posting behavior.

In [None]:
# Sentiment Analysis
from textblob import TextBlob
# Score every post with TextBlob's polarity value.
df['sentiment'] = df['content'].map(lambda post: TextBlob(post).sentiment.polarity)

# Distribution of sentiment
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['sentiment'], bins=20, kde=True, color='green', ax=ax)
ax.set_title('Distribution of Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# One box per cancer type showing the spread of the sentiment scores.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='cancer_type', y='sentiment', palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution by Cancer Type')
ax.set_xlabel('Cancer Type')
ax.set_ylabel('Sentiment')
plt.xticks(rotation=85)
plt.show()

In [None]:
# Mean sentiment per calendar month, plotted as a time series.
fig, ax = plt.subplots(figsize=(12, 6))
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M')
monthly_sentiment = df.groupby('month')['sentiment'].mean()
monthly_sentiment.plot(marker='o', color='skyblue', ax=ax)
ax.set_title('Average Sentiment Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Average Sentiment')
plt.xticks(rotation=45)
ax.grid(True)
plt.show()

In [None]:
# Keep only the rows holding the highest or lowest sentiment score, then show
# the first post found at each extreme.
highest_score = df['sentiment'].max()
lowest_score = df['sentiment'].min()
extreme_posts = df[(df['sentiment'] == highest_score) | (df['sentiment'] == lowest_score)]

print("Most Positive Post:")
most_positive = extreme_posts[extreme_posts['sentiment'] == extreme_posts['sentiment'].max()]
print(most_positive['content'].iloc[0])

print("\nMost Negative Post:")
most_negative = extreme_posts[extreme_posts['sentiment'] == extreme_posts['sentiment'].min()]
print(most_negative['content'].iloc[0])

In [None]:
from scipy.stats import ttest_ind

# Compare sentiment between two cancer types with Welch's t-test.
# The two most frequent labels present in the data are compared. The literal
# placeholder names are only a fallback when the data has fewer than two
# labels; in that case the size check below reports insufficient data.
type_frequencies = df['cancer_type'].value_counts()
cancer_type1 = type_frequencies.index[0] if len(type_frequencies) > 0 else 'Cancer_Type_1'
cancer_type2 = type_frequencies.index[1] if len(type_frequencies) > 1 else 'Cancer_Type_2'

cancer_type1_sentiment = df[df['cancer_type'] == cancer_type1]['sentiment']
cancer_type2_sentiment = df[df['cancer_type'] == cancer_type2]['sentiment']

if cancer_type1_sentiment.isnull().any() or cancer_type2_sentiment.isnull().any():
    # Missing scores are reported rather than passed to the t-test.
    print("Error: Missing values detected in sentiment scores.")
elif len(cancer_type1_sentiment) < 2 or len(cancer_type2_sentiment) < 2:
    # A variance estimate needs at least two observations per group.
    print("Error: Insufficient data for t-test.")
else:
    # Descriptive statistics
    print(f"Descriptive Statistics for {cancer_type1}:")
    print(cancer_type1_sentiment.describe())
    print(f"\nDescriptive Statistics for {cancer_type2}:")
    print(cancer_type2_sentiment.describe())

    # Overlaid histograms of the two groups
    plt.figure(figsize=(10, 6))
    plt.hist(cancer_type1_sentiment, bins=20, alpha=0.5, label=str(cancer_type1), color='blue')
    plt.hist(cancer_type2_sentiment, bins=20, alpha=0.5, label=str(cancer_type2), color='orange')
    plt.title('Histogram of Sentiment Scores by Cancer Type')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

    # Variances are printed for reference; equal_var=False below means the
    # test does not assume they are equal.
    print(f"\nVariance of {cancer_type1}:", cancer_type1_sentiment.var())
    print(f"Variance of {cancer_type2}:", cancer_type2_sentiment.var())

    t_stat, p_value = ttest_ind(cancer_type1_sentiment, cancer_type2_sentiment, equal_var=False)
    print("\nT-statistic:", t_stat)
    print("P-value:", p_value)


In [None]:
import seaborn as sns

# Log10 of the content length; 1 is added first so a zero-length post maps to 0
# instead of -inf.
df['log_content_length'] = np.log10(df['content_length'].add(1))

fig, ax = plt.subplots(figsize=(10, 20))
sns.scatterplot(data=df, x='log_content_length', y='sentiment', color='purple', alpha=0.5, ax=ax)
ax.set_title('Content Length vs. Sentiment')
ax.set_xlabel('Log(Content Length)')
ax.set_ylabel('Sentiment')
plt.show()


In [None]:
from scipy.stats import pearsonr

# Pearson correlation between log content length and sentiment, together with
# its p-value. The sign of r alone says nothing about significance, so the
# "positive"/"negative" verdict is only printed when p < ALPHA.
ALPHA = 0.05
paired_values = df[['log_content_length', 'sentiment']].dropna()

if len(paired_values) < 2:
    # pearsonr needs at least two paired observations.
    correlation_coefficient, correlation_p_value = float('nan'), float('nan')
else:
    correlation_coefficient, correlation_p_value = pearsonr(
        paired_values['log_content_length'], paired_values['sentiment']
    )

print("Correlation Coefficient:", correlation_coefficient)
print("P-value:", correlation_p_value)

# NaN values (too few rows, constant input) fail both comparisons and fall
# through to the final branch.
if correlation_p_value < ALPHA and correlation_coefficient > 0:
    print("There is a positive correlation between content length and sentiment.")
elif correlation_p_value < ALPHA and correlation_coefficient < 0:
    print("There is a negative correlation between content length and sentiment.")
else:
    print("There is no significant correlation between content length and sentiment.")


In [None]:
# 1. Identify Patterns or Trends
plt.figure(figsize=(10, 6))
sns.scatterplot(x='log_content_length', y='sentiment', data=df, color='skyblue', alpha=0.5)
plt.title('Content Length vs. Sentiment')
plt.xlabel('Log(Content Length)')
plt.ylabel('Sentiment')
plt.show()

In [None]:
# 2. Analyze Correlation
correlation_coefficient = df['log_content_length'].corr(df['sentiment'])
print("Correlation Coefficient:", correlation_coefficient)


In [None]:
# 3. Segmentation Analysis: same scatter as before, coloured by cancer type.
fig, ax = plt.subplots(figsize=(12, 6))
scatter = sns.scatterplot(
    data=df,
    x='log_content_length',
    y='sentiment',
    hue='cancer_type',
    palette='viridis',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Content Length vs. Sentiment (Segmented by Cancer Type)')
ax.set_xlabel('Log(Content Length)')
ax.set_ylabel('Sentiment')
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(title='Cancer Type', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# 4. Outlier Detection
outliers = df[(df['log_content_length'] > 2) & (df['sentiment'] > 0.5)]  # Example condition for outliers
print("Number of Outliers:", len(outliers))
print("Outlier Examples:")
print(outliers[['log_content_length', 'sentiment']].head())

In [None]:
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic Modeling: bag-of-words counts (English stop words removed, terms kept
# only if they occur in at least 2 documents and at most 95% of them), then a
# 5-topic LDA with a fixed seed.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
content_matrix = vectorizer.fit_transform(df['content'])
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(content_matrix)

# For each topic print its 10 highest-weighted vocabulary terms
# (listed from lowest to highest weight among those 10).
feature_names = vectorizer.get_feature_names_out()
for topic_number, topic in enumerate(lda_model.components_, start=1):
    top_term_indices = topic.argsort()[-10:]
    print(f"Top words for Topic {topic_number}:")
    print([feature_names[index] for index in top_term_indices])


#Topic 1: This topic seems to be related to experiences with surgery, pain, and medical professionals (e.g., doctors).

#Topic 2: This topic appears to be associated with medical terms related to diagnosis and treatment, such as nodes,
#biopsy, scan, tumor, and cancer.

#Topic 3: This topic might be related to personal experiences with cancer diagnosis and treatment, including interactions with
#doctors and emotional responses.

#Topic 4: This topic seems to involve aspects of life as a cancer patient, including treatment, time, and the impact on life.

#Topic 5: This topic appears to focus on specific treatment modalities like radiation and chemotherapy, as well as the experience of
#being diagnosed with cancer.

#Based on these interpretations, you can see that each topic represents a different theme or aspect related to cancer diagnosis,
#treatment, and personal experiences. These topics can provide valuable insights into the underlying structure of the corpus and help
#organize and summarize the content of the documents. Depending on your specific goals or application, you can use these topics for tasks
#such as document categorization, content recommendation, or sentiment analysis.

In [None]:
# Select the source and target columns to create edges_df
edges_df = df[['username', 'comments']].copy()

# Display the first few rows of edges_df
print(edges_df.head())


In [None]:
# Share of rows per cancer type as a pie chart.
fig, ax = plt.subplots(figsize=(8, 8))
df['cancer_type'].value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=sns.color_palette('pastel'),
    startangle=140,
)
ax.set_title('Cancer Type Distribution')
ax.set_ylabel('')
plt.show()


In [None]:
# sns.pairplot builds its own figure, so no plt.figure() call is made here
# (a preceding plt.figure() would only leave an empty extra figure in the output).
pairplot = sns.pairplot(df[['content_length', 'sentiment']], diag_kind='kde', plot_kws={'color': 'skyblue'})
# y slightly above 1 lifts the title clear of the top row of panels.
pairplot.fig.suptitle('Pairplot of Content Length and Sentiment', y=1.02)
plt.show()


In [None]:
# Bar height is seaborn's default estimate (the mean) of sentiment per cancer type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='cancer_type', y='sentiment', palette='pastel', ax=ax)
ax.set_title('Average Sentiment by Cancer Type')
ax.set_xlabel('Cancer Type')
ax.set_ylabel('Average Sentiment')
plt.xticks(rotation=85)
plt.show()


In [None]:
plt.figure(figsize=(10, 6))
sns.violinplot(x='cancer_type', y='sentiment', data=df, palette='pastel')
plt.title('Violin Plot of Sentiment Distribution by Cancer Type')
plt.xlabel('Cancer Type')
plt.ylabel('Sentiment')
plt.xticks(rotation=85)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Number of rows per username, most frequent first.
user_comment_counts = df['username'].value_counts()

# The ten most frequent usernames.
top_10_users = user_comment_counts.iloc[:10]

# Bar chart of those ten counts.
fig, ax = plt.subplots(figsize=(10, 6))
top_10_users.plot.bar(ax=ax, color='skyblue')
ax.set_title('Top 10 Users by Number of Comments')
ax.set_xlabel('Usernames')
ax.set_ylabel('Number of Comments')
plt.xticks(rotation=45)
plt.show()


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Fetch the NLTK resources used below (tokenizer model and stop-word list).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Join every post into one string and tokenize it.
comments = ' '.join(df['content'])
tokens = word_tokenize(comments)

# English stop words to exclude.
stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens, lower-cased, that are not stop words.
lowercased_alpha = (word.lower() for word in tokens if word.isalpha())
filtered_tokens = [word for word in lowercased_alpha if word not in stop_words]

# Frequency table and its ten most common entries.
freq_dist = FreqDist(filtered_tokens)
top_words = freq_dist.most_common(10)

# Bar chart: zip(*top_words) splits the (word, count) pairs into the x and
# height arguments of bar().
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(*zip(*top_words), color='skyblue')
ax.set_title('Top 10 Most Frequent Words in Content (excluding stopwords)')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'date' column already in datetime format
df['date'] = pd.to_datetime(df['date'])

# Group by date and count the number of posts for each date
post_count_by_date = df.resample('D', on='date').size()

# Plot the number of posts over time
plt.figure(figsize=(12, 6))
post_count_by_date.plot(linewidth=2, color='skyblue')
plt.title('Number of Posts Over Time', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Number of Posts', fontsize=14)
plt.grid(True)
plt.show()


In [None]:
!pip install python-louvain

In [None]:
!pip install networkx python-louvain

In [None]:
import pandas as pd

# Build the author/comment table in one chain: select the two columns, drop
# rows where either is missing, and rename them to the names the community
# network code expects.
df_comments = (
    df[['username', 'comments']]
    .dropna()
    .rename(columns={'username': 'Author', 'comments': 'Comment'})
)

# Display the first few rows of the newly created DataFrame
print(df_comments.head())


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import community

# Function to draw network graph with enhanced features
def draw_network_graph(G, cancer_type):
    """Draw a user-interaction graph, colouring nodes by detected community.

    Parameters
    ----------
    G : networkx.Graph
        Graph to draw.
    cancer_type : str
        Label inserted into the figure title.

    When the graph has at least one edge, communities are found with
    ``greedy_modularity_communities`` and each one is drawn in its own colour
    with a legend entry. Otherwise all nodes are drawn in the default colour.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))

    # Spring layout with adjusted parameters for node placement.
    pos = nx.spring_layout(G, k=0.3, iterations=50)

    if G.number_of_edges() > 0:
        communities = community.greedy_modularity_communities(G)
        palette = plt.cm.tab10

        for i, community_nodes in enumerate(communities):
            nx.draw_networkx_nodes(
                G,
                pos,
                nodelist=list(community_nodes),
                # i % palette.N cycles through the palette; an integer index
                # past the last entry would otherwise be clipped to one colour.
                # The RGBA tuple is wrapped in a list so it is read as a single
                # colour and not as four scalar values.
                node_color=[palette(i % palette.N)],
                node_size=200,
                label=f'Community {i+1}',
            )
    else:
        # No edges: draw nodes without community colouring.
        nx.draw_networkx_nodes(G, pos, node_size=200, label='Nodes')

    nx.draw_networkx_edges(G, pos, width=0.5, alpha=0.7)

    nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold', verticalalignment='center', horizontalalignment='left')

    plt.title(f'User Interaction Network for {cancer_type}')
    plt.legend(loc='upper left', fontsize='medium', bbox_to_anchor=(1, 1))  # legend outside the plot area
    plt.axis('off')
    plt.tight_layout()
    plt.show()

from itertools import combinations

# Build and draw one interaction graph per cancer type.
for cancer_type in df['cancer_type'].unique():
    # Rows belonging to the current cancer type.
    cancer_df = df[df['cancer_type'] == cancer_type]

    G = nx.Graph()

    # One node per username; missing usernames are skipped because they cannot
    # be looked up inside the comments.
    unique_users = cancer_df['username'].dropna().unique()
    G.add_nodes_from(unique_users)

    # Two users are linked when both usernames occur in the same row's
    # 'comments' value. The users present in a row are collected once and then
    # paired, instead of testing every (user1, user2) pair for every row.
    # Rows with missing comments are skipped.
    for comments in cancer_df['comments'].dropna():
        users_in_row = [user for user in unique_users if user in comments]
        G.add_edges_from(combinations(users_in_row, 2))

    # Draw the network graph with enhanced features
    draw_network_graph(G, cancer_type)


In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import nltk

# Step 1: Filter Data for a Specific Cancer Type
cancer_type = 'Breast Cancer'  # We can change this to the cancer type you are interested in
# .copy() gives an independent frame, so adding the 'processed_content' column
# later does not write into a slice of df (SettingWithCopyWarning).
filtered_data = df[df['cancer_type'] == cancer_type].copy()

# Step 2: Text Preprocessing
# The lemmatizer looks words up in the WordNet corpus, so make sure it is
# available before it is first used.
nltk.download('wordnet', quiet=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Return the lemmatized, lower-cased alphabetic tokens of ``text``.

    Tokens that are not purely alphabetic or that appear in ``stop_words`` are
    dropped before lemmatization.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Tokenize/lemmatize every post of the selected cancer type.
filtered_data['processed_content'] = filtered_data['content'].map(preprocess_text)

# Step 3: Frequency Analysis
# Collect the tokens of all posts into one flat list.
all_tokens = []
for post_tokens in filtered_data['processed_content']:
    all_tokens.extend(post_tokens)

# Frequency of each token.
word_freq = Counter(all_tokens)

# The ten most frequent tokens (reported as "symptoms" in the output below).
top_symptoms = word_freq.most_common(10)
print("Top 10 Common Symptoms for", cancer_type, ":\n", top_symptoms)

# Step 4: Visualization
# Word cloud sized by those ten frequencies.
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate_from_frequencies(dict(top_symptoms))

# Plot the word cloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.title('Top 10 Common Symptoms for ' + cancer_type)
# plt.axis('off')
# plt.show()


In [None]:
import nltk
nltk.download('wordnet')

In [None]:
import re
from collections import Counter

# Step 1: Define Relevant Keywords for Side Effects
side_effect_keywords = ["nausea", "fatigue", "hair loss", "vomiting", "pain", "weight loss"]  # Add more as needed

# Step 2: Text Processing
# Function to preprocess text (tokenization and lowercase)
def preprocess_text(text):
    """Lower-case ``text`` and split it into word tokens.

    Parameters
    ----------
    text : str or any
        Text to tokenize. Non-string values (for example the float NaN pandas
        stores for a missing cell) produce an empty list instead of raising
        ``AttributeError``.

    Returns
    -------
    list of str
        The ``\\w+`` runs of the lower-cased text, in order.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r'\b\w+\b', text.lower())

# Preprocess the 'comments' and 'content' columns
df['processed_comments'] = df['comments'].apply(preprocess_text)
df['processed_content'] = df['content'].apply(preprocess_text)

# Step 3: Keyword Matching
# Function to count occurrences of side effect keywords
def count_side_effects(text_tokens, keywords=None):
    """Count side-effect keyword mentions in a tokenized text.

    Parameters
    ----------
    text_tokens : iterable of str
        Word tokens of one text, in order.
    keywords : iterable of str, optional
        Keywords or space-separated keyword phrases to look for. Defaults to
        the notebook-level ``side_effect_keywords`` list.

    Returns
    -------
    collections.Counter
        Mapping keyword -> number of occurrences. Multi-word keywords such as
        "hair loss" are matched against consecutive tokens; comparing single
        tokens against the keyword list can never match them. A token may
        count towards both a single-word keyword and a phrase containing it.
    """
    if keywords is None:
        keywords = side_effect_keywords

    # Group the keywords by their number of words so the token list is scanned
    # once per phrase length.
    keywords_by_length = {}
    for keyword in keywords:
        words = tuple(keyword.split())
        if words:
            keywords_by_length.setdefault(len(words), {})[words] = keyword

    tokens = list(text_tokens)
    counts = Counter()
    for length, phrases in keywords_by_length.items():
        for start in range(len(tokens) - length + 1):
            window = tuple(tokens[start:start + length])
            if window in phrases:
                counts[phrases[window]] += 1
    return counts

# Count side effect occurrences in comments and content
# (each cell of the new columns holds the Counter returned by count_side_effects)
df['side_effects_comments'] = df['processed_comments'].apply(count_side_effects)
df['side_effects_content'] = df['processed_content'].apply(count_side_effects)

# Step 4: Count Occurrences of Each Side Effect
# Combine counts from comments and content
# (row-wise `+` between the two Counter columns)
combined_side_effects = df['side_effects_comments'] + df['side_effects_content']

# Calculate total occurrences of each side effect
# (sums the per-row Counters over all rows)
total_side_effect_counts = combined_side_effects.sum()

# Print the result
for side_effect, count in total_side_effect_counts.items():
    print(f"{side_effect}: {count} mentions")


In [None]:
import re
from collections import Counter

# Step 1: Define Relevant Keywords for Side Effects
side_effect_keywords = [
    "nausea", "vomiting", "diarrhea", "constipation", "abdominal pain", "stomach pain",
    "loss of appetite", "weight loss", "fatigue", "weakness", "dizziness", "headache",
    "numbness", "tingling", "muscle pain", "joint pain", "back pain", "chest pain",
    "shortness of breath", "coughing", "high blood pressure", "low blood pressure",
    "irregular heartbeat", "palpitations", "fever", "chills", "night sweats", "infection",
    "sore throat", "difficulty swallowing", "mouth sores", "hair loss", "skin rash",
    "itching", "bruising", "bleeding", "anemia", "memory problems", "confusion",
    "anxiety", "depression", "hallucinations", "seizures", "tremors", "vision changes",
    "hearing changes", "difficulty walking", "loss of balance"
]


# Step 2: Text Processing
# Function to preprocess text (tokenization and lowercase)
def preprocess_text(text):
    """Lower-case ``text`` and split it into word tokens.

    Parameters
    ----------
    text : str or any
        Text to tokenize. Non-string values (for example the float NaN pandas
        stores for a missing cell) produce an empty list instead of raising
        ``AttributeError``.

    Returns
    -------
    list of str
        The ``\\w+`` runs of the lower-cased text, in order.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r'\b\w+\b', text.lower())

# Preprocess the 'comments' and 'content' columns
df['processed_comments'] = df['comments'].apply(preprocess_text)
df['processed_content'] = df['content'].apply(preprocess_text)

# Step 3: Keyword Matching
# Function to count occurrences of side effect keywords
def count_side_effects(text_tokens, keywords=None):
    """Count side-effect keyword mentions in a tokenized text.

    Parameters
    ----------
    text_tokens : iterable of str
        Word tokens of one text, in order.
    keywords : iterable of str, optional
        Keywords or space-separated keyword phrases to look for. Defaults to
        the notebook-level ``side_effect_keywords`` list.

    Returns
    -------
    collections.Counter
        Mapping keyword -> number of occurrences. Multi-word keywords such as
        "shortness of breath" are matched against consecutive tokens;
        comparing single tokens against the keyword list can never match them.
        A token may count towards both a single-word keyword and a phrase
        containing it.
    """
    if keywords is None:
        keywords = side_effect_keywords

    # Group the keywords by their number of words so the token list is scanned
    # once per phrase length.
    keywords_by_length = {}
    for keyword in keywords:
        words = tuple(keyword.split())
        if words:
            keywords_by_length.setdefault(len(words), {})[words] = keyword

    tokens = list(text_tokens)
    counts = Counter()
    for length, phrases in keywords_by_length.items():
        for start in range(len(tokens) - length + 1):
            window = tuple(tokens[start:start + length])
            if window in phrases:
                counts[phrases[window]] += 1
    return counts

# Define a function to apply to each group
def get_side_effect_counts(group):
    """Total side-effect mentions over the comments and content of one group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one cancer type; must provide the 'processed_comments' and
        'processed_content' token-list columns.

    Returns
    -------
    collections.Counter
        Keyword -> number of mentions summed over both columns and all rows.
        An empty group yields an empty Counter.

    The group is only read: no helper columns are written onto the frame
    handed in by ``groupby``.
    """
    total_side_effect_counts = Counter()
    for column in ('processed_comments', 'processed_content'):
        for tokens in group[column]:
            total_side_effect_counts.update(count_side_effects(tokens))
    return total_side_effect_counts

# Apply the function to each group (cancer type)
# The result is indexed by cancer type; each entry is whatever
# get_side_effect_counts returned for that group.
side_effect_counts_by_cancer_type = df.groupby('cancer_type').apply(get_side_effect_counts)

# Print the results
for cancer_type, side_effect_counts in side_effect_counts_by_cancer_type.items():
    print(f"Side effects for {cancer_type}:")
    for side_effect, count in side_effect_counts.items():
        print(f"{side_effect}: {count} mentions")
    print()


In [None]:
cancer_type_counts = df['cancer_type'].value_counts()
print(cancer_type_counts)

In [None]:
import pandas as pd
from itertools import permutations, combinations

# Extract unique cancer types
cancer_types = df['cancer_type'].unique()

# The largest valid combination size is the number of distinct cancer types.
max_set_size = len(cancer_types)

# Prompt the user to input the size of the combination set
try:
    set_size = int(input("Enter the size of the combination set (1 to " + str(max_set_size) + "): "))
except ValueError:
    # Non-numeric input: use an out-of-range value so the validation below
    # reports it, instead of the cell stopping with a traceback.
    set_size = 0

if 1 <= set_size <= max_set_size:
    # Get combinations
    print(f"\nCombinations for set size {set_size}:")
    for combo in combinations(cancer_types, set_size):
        print(combo)
else:
    print("Invalid set size. Please enter a value between 1 and", max_set_size)


In [None]:
import pandas as pd
from itertools import permutations, combinations

# Extract unique cancer types
cancer_types = df['cancer_type'].unique()

# The largest valid combination size is the number of distinct cancer types.
max_set_size = len(cancer_types)

# Prompt the user to input the size of the combination set
try:
    set_size = int(input("Enter the size of the combination set (1 to " + str(max_set_size) + "): "))
except ValueError:
    # Non-numeric input: use an out-of-range value so the validation below
    # reports it, instead of the cell stopping with a traceback.
    set_size = 0

if 1 <= set_size <= max_set_size:
    # Materialise the combinations so they can be counted before printing.
    combinations_list = list(combinations(cancer_types, set_size))
    combination_count = len(combinations_list)

    print(f"\nCombinations for set size {set_size} (Total Number of Combinations: {combination_count}):")
    for combo in combinations_list:
        print(combo)
else:
    print("Invalid set size. Please enter a value between 1 and", max_set_size)


In [None]:
import pandas as pd
from itertools import permutations, combinations

# Assuming cancer types are in the 'cancer_type' column of your DataFrame 'df'
# Assuming 'side_effect_counts_by_cancer_type' holds side effect counts for each type (from previous code)

# Extract unique cancer types
cancer_types = df['cancer_type'].unique()

# Function to find commonly mentioned side effects within a set of cancer types
def find_common_symptoms(type_set, counts_by_type=None):
    """Return the side effects mentioned for every cancer type in ``type_set``.

    Parameters
    ----------
    type_set : iterable
        Cancer type labels to compare.
    counts_by_type : pandas.Series, optional
        Series indexed by cancer type whose values map side effect -> count.
        Defaults to the notebook-level ``side_effect_counts_by_cancer_type``.

    Returns
    -------
    list
        Side effects whose count is above zero for each selected type, in
        first-seen order. Empty when ``type_set`` is empty.

    A side effect is "common" only when all selected types mention it, so the
    minimum count across types is used. Summing the counts would also accept
    side effects mentioned by just one of the types.
    """
    if counts_by_type is None:
        counts_by_type = side_effect_counts_by_cancer_type

    per_type_counts = [counts_by_type.loc[cancer_type] for cancer_type in type_set]
    if not per_type_counts:
        return []

    # One row per cancer type, one column per side effect; a side effect that a
    # type never mentions becomes 0.
    type_side_effects_df = pd.DataFrame(per_type_counts).fillna(0)
    min_count = type_side_effects_df.min(axis=0)

    common_symptoms = [side_effect for side_effect, count in min_count.items() if count > 0]
    return common_symptoms

# Get user input for set size
max_set_size = len(cancer_types)
try:
    set_size = int(input("Enter the size of the combination set (1 to " + str(max_set_size) + "): "))
except ValueError:
    # Non-numeric input: use an out-of-range value so the validation below
    # reports it, instead of the cell stopping with a traceback.
    set_size = 0

if 1 <= set_size <= max_set_size:
    # Generate combinations and count them
    combinations_list = list(combinations(cancer_types, set_size))
    combination_count = len(combinations_list)

    print(f"\nCombinations for set size {set_size} (Total Number of Combinations: {combination_count}):")

    # Find common symptoms for each combination
    for combo in combinations_list:
        common_symptoms = find_common_symptoms(combo)
        print(f"Common symptoms for {combo}: {common_symptoms}")
else:
    print("Invalid set size. Please enter a value between 1 and", max_set_size)


In [None]:
import pandas as pd
from itertools import combinations

# Assuming cancer types are in the 'cancer_type' column of your DataFrame 'df'
# Assuming 'side_effect_counts_by_cancer_type' holds side effect counts for each type (from previous code)

# Extract unique cancer types
cancer_types = df['cancer_type'].unique()

# Function to find commonly mentioned side effects within a set of cancer types
def find_common_symptoms(type_set, counts_by_type=None):
    """Return the side effects mentioned for every cancer type in ``type_set``.

    Parameters
    ----------
    type_set : iterable
        Cancer type labels to compare.
    counts_by_type : pandas.Series, optional
        Series indexed by cancer type whose values map side effect -> count.
        Defaults to the notebook-level ``side_effect_counts_by_cancer_type``.

    Returns
    -------
    list
        Side effects whose count is above zero for each selected type, in
        first-seen order. Empty when ``type_set`` is empty.

    A side effect is "common" only when all selected types mention it, so the
    minimum count across types is used. Summing the counts would also accept
    side effects mentioned by just one of the types.
    """
    if counts_by_type is None:
        counts_by_type = side_effect_counts_by_cancer_type

    per_type_counts = [counts_by_type.loc[cancer_type] for cancer_type in type_set]
    if not per_type_counts:
        return []

    # One row per cancer type, one column per side effect; a side effect that a
    # type never mentions becomes 0.
    type_side_effects_df = pd.DataFrame(per_type_counts).fillna(0)
    min_count = type_side_effects_df.min(axis=0)

    common_symptoms = [side_effect for side_effect, count in min_count.items() if count > 0]
    return common_symptoms

# Numbered menu of the available cancer types.
print("\n**Menu Card for Cancer Type Selection**")
print("Select cancer types (enter corresponding numbers separated by spaces):")
for menu_number, cancer_type in enumerate(cancer_types, start=1):
    print(f"{menu_number}. {cancer_type}")

# Keep asking for selections until the user enters 'q'.
while True:
    user_choice = input("\nEnter cancer type numbers (separated by spaces) or 'q' to quit: ")
    if user_choice.lower() == 'q':
        break
    try:
        # Menu numbers are 1-based; the set removes repeated entries.
        selected_cancer_indices = {int(num) - 1 for num in user_choice.split()}

        has_out_of_range = any(i < 0 or i >= len(cancer_types) for i in selected_cancer_indices)
        if has_out_of_range:
            print("Invalid cancer type number(s). Please enter numbers between 1 and", len(cancer_types))
        else:
            # Map the chosen indices back to cancer type labels.
            selected_cancer_types = tuple(cancer_types[i] for i in selected_cancer_indices)

            # Report the side effects found for the chosen types.
            common_symptoms = find_common_symptoms(selected_cancer_types)
            print(f"\nCommon symptoms for {selected_cancer_types}: {common_symptoms}")

    except ValueError:
        print("Invalid input. Please enter integers separated by spaces or 'q' to quit.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords  # Import stopwords


def analyze_top_discussion_topics_per_type(df, n=10):
    """Print the ``n`` most frequent two-word phrases for each cancer type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'cancer_type', 'title', 'content' and
        'comments'.
    n : int, default 10
        Number of bigrams printed per cancer type.

    For every cancer type the title, content and comments of each row are
    combined, lower-cased, stripped of punctuation and stop words, and turned
    into bigrams built within a single row. The bigrams are counted over all
    rows of the group. The cancer type label itself is not added to the text,
    since it would appear once per row and dominate the counts. The input
    frame is not modified and nothing is returned.
    """
    stop_words = set(stopwords.words('english'))
    text_columns = ["title", "content", "comments"]

    for cancer_type, group_df in df.groupby("cancer_type"):
        print(f"\nTop {n} Most Frequent Topics for {cancer_type} Discussions:")

        # Missing cells become empty strings, so one NaN does not blank out the
        # whole row's text.
        combined_text = group_df[text_columns].fillna("").astype(str).apply(" ".join, axis=1)

        # Lower-case, then replace everything except letters, digits and
        # whitespace with a space. regex=True is required for the pattern to be
        # treated as a regular expression.
        cleaned_text = combined_text.str.lower().str.replace(r"[^a-z0-9\s]", " ", regex=True)

        bigram_counts = Counter()
        for document in cleaned_text:
            tokens = [word for word in document.split() if word not in stop_words]
            bigram_counts.update(ngrams(tokens, 2))

        # Print top n most frequent topics
        for bigram, count in bigram_counts.most_common(n):
            print(f"- {' '.join(bigram)}: {count} mentions")


# Load your cancer discussion dataset into a DataFrame named 'df' (replace with your actual loading steps)
# NOTE(review): this rebinds `df` to a fresh read of the CSV, so columns that
# earlier cells added to the previous `df` (e.g. 'sentiment', 'content_length')
# and the earlier dropna on 'content' do not carry over to the new frame.
df = pd.read_csv("/content/preprocessed_data_csv.csv")

# Analyze top discussion topics for each cancer type
# (a copy is passed, so the function works on its own frame)
analyze_top_discussion_topics_per_type(df.copy())


In [None]:

# Parse date strings into datetime objects (adjust if date format is different)
df['date'] = pd.to_datetime(df['date'])

# Oldest and latest timestamps in the data
min_date = df['date'].min()
max_date = df['date'].max()

# Elapsed time between them
time_delta = max_date - min_date

# Elapsed years derived from the elapsed days. Subtracting the calendar years
# would report 1 year for 2020-12-31 -> 2021-01-01.
num_years = time_delta.days / 365.25

# Print results
print("Range of Dates:")
print(f"- Oldest Date: {min_date.strftime('%Y-%m-%d')}")
print(f"- Latest Date: {max_date.strftime('%Y-%m-%d')}")
print(f"- Number of Years: {num_years:.1f}")
print(f"- Number of Days: {time_delta.days}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk

# Initialize VADER
# The analyzer loads the 'vader_lexicon' resource when it is constructed, so
# make sure the resource is present first.
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# 2. Identifying Specific Emotions
def identify_emotions(text, analyzer=None):
    """Return the dominant VADER sentiment class of ``text``.

    Parameters
    ----------
    text : str or any
        Text to score. Non-string values (e.g. NaN) return ``None``.
    analyzer : object with ``polarity_scores(text) -> dict``, optional
        Sentiment analyzer to use. Defaults to the notebook-level ``sid``.

    Returns
    -------
    str or None
        The key with the highest score among the analyzer's class scores
        ('neg', 'neu', 'pos' for VADER). The 'compound' entry is an aggregate
        score, not a class, so it is excluded from the comparison.
    """
    if not isinstance(text, str):
        return None  # NaN / missing values carry no text to score
    if analyzer is None:
        analyzer = sid

    sentiment_score = analyzer.polarity_scores(text)
    class_scores = {label: score for label, score in sentiment_score.items() if label != 'compound'}
    return max(class_scores, key=class_scores.get)

# Apply the function to the 'content' column
df['emotion'] = df['content'].apply(identify_emotions)


In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 3. Topic-Specific Sentiment
def get_topics_sentiment(texts):
    """Describe each text by the top words of its most probable LDA topic.

    Parameters
    ----------
    texts : pandas.Series
        Documents; missing values are treated as empty strings.

    Returns
    -------
    list of list of str
        One entry per input text, in order: the five highest-weighted
        vocabulary terms of the topic with the largest share in that text.

    A 5-topic LDA (fixed seed) is fitted on bag-of-words counts of the texts.
    Topic indices are first mapped to their own top terms; using a topic index
    directly as a vocabulary index would only ever return the first few
    vocabulary entries.
    """
    texts = texts.fillna('')

    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(texts)

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(X)

    feature_names = vectorizer.get_feature_names_out()

    # Top five terms of every topic, highest weight first.
    top_words_per_topic = [
        [feature_names[idx] for idx in topic_weights.argsort()[::-1][:5]]
        for topic_weights in lda.components_
    ]

    # Document-topic mixtures for all texts in one call, instead of one
    # transform per text.
    topic_mixtures = lda.transform(X)
    dominant_topics = topic_mixtures.argmax(axis=1)

    # A fresh list per document so rows do not share one mutable object.
    topic_sentiments = [list(top_words_per_topic[topic_idx]) for topic_idx in dominant_topics]
    return topic_sentiments

df['topics_sentiment'] = get_topics_sentiment(df['content'].fillna(''))  # Fill NaN values with empty string


In [None]:
# 4. Comparative Sentiment Analysis
def compare_sentiment_across_groups(data, group_column, text_column):
    """Collect VADER compound scores of ``text_column`` for each group.

    Args:
        data: DataFrame holding both columns.
        group_column: Name of the column whose distinct values define the groups.
        text_column: Name of the column with the text to score. Missing text
            is scored as an empty string.

    Returns:
        A dict mapping each group value (in order of first appearance) to the
        list of compound scores of its rows. Rows whose group value is missing
        are left out, so no group maps to an empty list.
    """
    sentiments_by_group = {}

    # One pass over the frame instead of one boolean scan per group.
    # sort=False keeps the groups in order of first appearance.
    for group, texts in data.groupby(group_column, sort=False)[text_column]:
        sentiments_by_group[group] = [
            sid.polarity_scores(text)['compound']
            for text in texts.fillna('')  # Fill NaN values with empty string
        ]

    return sentiments_by_group

# Compound sentiment scores of the 'content' column, keyed by 'cancer_type' value.
sentiments_by_cancer_type = compare_sentiment_across_groups(df, 'cancer_type', 'content')

In [None]:
# Displaying the results
print("Emotions Identified:")
print(df['emotion'].value_counts())

# 'topics_sentiment' holds one keyword list per post, so enumerating the column
# directly would print one "Topic N" line for every row of the dataframe.
# Summarise the distinct keyword sets and how many posts share each instead.
print("\nTopic-Specific Sentiment:")
topic_keyword_counts = df['topics_sentiment'].apply(', '.join).value_counts()
for idx, (keywords, n_posts) in enumerate(topic_keyword_counts.items(), start=1):
    print(f"Topic {idx}: {keywords} ({n_posts} posts)")

print("\nComparative Sentiment Analysis by Cancer Type:")
for cancer_type, sentiments in sentiments_by_cancer_type.items():
    # np.mean of an empty list warns and yields nan; report that case explicitly.
    if len(sentiments) == 0:
        print(f"{cancer_type}: no posts")
        continue
    print(f"{cancer_type}: Mean Sentiment {np.mean(sentiments):.2f}")


In [None]:
import spacy
from collections import Counter
import pandas as pd

# Load spaCy's small English pipeline. It is used below both for named-entity
# recognition (doc.ents) and for part-of-speech tags (token.pos_).
# NOTE(review): the 'en_core_web_sm' model is presumably installed separately
# from spaCy itself -- confirm the environment provides it.
nlp = spacy.load("en_core_web_sm")

# Function to identify opinion targets using NER
def identify_opinion_targets(text):
    """Return the surface text of every ORG or PRODUCT entity found in ``text``.

    Args:
        text: The string to run through the spaCy pipeline ``nlp``.

    Returns:
        A list of entity strings in document order (empty when none match).
    """
    # Consider only organizations and products as opinion targets
    target_labels = ('ORG', 'PRODUCT')
    return [entity.text for entity in nlp(text).ents if entity.label_ in target_labels]

# Function to perform frequency analysis of nouns
def frequency_analysis_of_nouns(texts):
    """Count how often each noun occurs across ``texts``.

    Args:
        texts: Iterable of strings; each one is parsed with the spaCy pipeline ``nlp``.

    Returns:
        A ``collections.Counter`` mapping the text of every token tagged
        'NOUN' to its number of occurrences.
    """
    return Counter(
        token.text
        for doc in map(nlp, texts)
        for token in doc
        if token.pos_ == 'NOUN'
    )

# Run the analysis on the posts dataframe `df` built earlier in the notebook.
# Do not rebind `df` to a placeholder frame here: the cells that follow read
# the real posts from `df`, and a toy frame would silently replace them.
# NOTE(review): this parses every post with spaCy twice (entities, then nouns),
# which can be slow on the full dataset.

# Replace NaN values with empty strings
df['content'] = df['content'].fillna('')

df['opinion_targets_ner'] = df['content'].apply(identify_opinion_targets)
noun_frequencies = frequency_analysis_of_nouns(df['content'])

# Show the NER-based opinion targets, skipping posts where none were found
print("\nOpinion Targets Identified using Named Entity Recognition (NER):")
for entry_no, found_targets in enumerate(df['opinion_targets_ner'], start=1):
    if not found_targets:
        continue
    joined_targets = ', '.join(found_targets)
    print(f"Opinion Targets for entry {entry_no}: {joined_targets}")

# Show the ten nouns that occur most often
print("\nMost Frequent Nouns:")
top_nouns = noun_frequencies.most_common(10)
print(top_nouns)


In [None]:
def get_sentiment_and_emotions(text):
    """Score ``text`` with TextBlob and derive a coarse emotion label from it.

    NOTE(review): TextBlob is not imported in this cell; it has to be in the
    kernel namespace already.

    Args:
        text: The text string to be analyzed.

    Returns:
        A dictionary with two keys:
          "sentiment": {"polarity": ..., "subjectivity": ...} taken from
              TextBlob's ``sentiment`` (polarity > 0 positive, 0 neutral,
              < 0 negative; subjectivity 0 objective, 1 subjective).
          "emotions": ["joy"] when polarity > 0.5, ["anger"] when
              polarity < -0.5, otherwise an empty list.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity

    # Only strongly polarised text gets an emotion label; extend the mapping
    # here if more emotions are needed.
    if polarity > 0.5:
        emotions = ["joy"]
    elif polarity < -0.5:
        emotions = ["anger"]
    else:
        emotions = []

    return {
        "sentiment": {"polarity": polarity, "subjectivity": scores.subjectivity},
        "emotions": emotions,
    }


In [None]:
# Create a new column to store sentiment analysis results.
# Each cell of 'sentiment_analysis' holds the dict returned by get_sentiment_and_emotions.
df['sentiment_analysis'] = df['content'].apply(get_sentiment_and_emotions)


In [None]:
# Flatten the per-post emotion lists into one list (falsy results are skipped)
all_emotions = [
    emotion
    for analysis in df['sentiment_analysis']
    if analysis
    for emotion in analysis['emotions']
]

# With nothing detected there is nothing to count or plot
if not all_emotions:
    print("No emotions detected in the data.")
else:
    emotion_counts = pd.Series(all_emotions).value_counts()
    print("Emotion Distribution:")
    print(emotion_counts)

    # Bar chart of how often each emotion was detected
    emotion_counts.plot(kind='bar')
    plt.title("Distribution of Emotions in Reviews")
    plt.show()


In [None]:
# Define topics/features of interest (replace with your actual topics)
topics = ["treatment", "side effects", "survival rate"]

# Analyze sentiment for each topic
for topic in topics:
    # Case-insensitive search.
    # regex=False matches the phrase literally; na=False treats posts with
    # missing content as "no match" so the mask stays purely boolean.
    mentions_topic = df['content'].str.contains(topic, case=False, na=False, regex=False)
    topic_reviews = df[mentions_topic]

    # Without matching posts there is no mean to report.
    if topic_reviews.empty:
        print(f"Average sentiment for topic '{topic}': n/a (no matching posts)")
        continue

    topic_sentiment = topic_reviews['sentiment_analysis'].apply(lambda x: x['sentiment']['polarity']).mean()
    print(f"Average sentiment for topic '{topic}': {topic_sentiment:.2f}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the posts, parse the 'date' column as datetimes and discard rows
# that have no 'content'
df = (
    pd.read_csv('/content/preprocessed_data_csv.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(subset=['content'])
)

# VADER analyzer used for the sentiment scores below
sid = SentimentIntensityAnalyzer()

# Function to perform sentiment analysis and return compound score
def get_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    return sid.polarity_scores(text)['compound']

# Apply sentiment analysis to each post
df['sentiment_score'] = df['content'].apply(get_sentiment_score)

# Extract year, month and season from the date (the festival is derived from the month further down)
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# Season code from the month: Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
df['season'] = df['date'].dt.month % 12 // 3 + 1  # Calculate season based on month

# Function to identify festival based on month
def identify_festival(month):
    """Map a month number to the festival label used for grouping.

    Args:
        month: Month number (1-12).

    Returns:
        The festival name associated with that month, or 'Other' for months
        without an entry.
    """
    festival_by_month = {
        1: 'New Year',
        5: 'Cinco de Mayo',
        7: 'Independence Day',
        10: 'Halloween',
        12: 'Christmas',
    }
    return festival_by_month.get(month, 'Other')

df['festival'] = df['month'].apply(identify_festival)

# Group by year, month, season and festival, calculate average sentiment
sentiment_by_time_and_festival = df.groupby(['year', 'month', 'season', 'festival']).agg({'sentiment_score': 'mean'}).reset_index()

# Plot trend analysis by season and festival.
# Season codes come from `month % 12 // 3 + 1`, i.e. 1 = Dec-Feb, 2 = Mar-May,
# 3 = Jun-Aug, 4 = Sep-Nov, so the tick labels have to start with Winter.
season_order = [1, 2, 3, 4]
season_labels = ['Winter', 'Spring', 'Summer', 'Autumn']

plt.figure(figsize=(8, 5))
# order= pins every season to a fixed x position even if one is missing from the data
sns.barplot(data=sentiment_by_time_and_festival, x='season', y='sentiment_score', hue='festival', order=season_order)
plt.title('Sentiment Analysis by Season and Festival')
plt.xlabel('Season')
plt.ylabel('Average Sentiment Score')
plt.xticks(ticks=range(len(season_order)), labels=season_labels)
plt.legend(title='Festival')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Step 1: Data Preprocessing
# Assuming your dataset is stored in a DataFrame called df
# Further preprocessing may be required based on specific needs
# For example, removing irrelevant columns, handling missing values, etc.

# Step 2: Graph Creation
# Bipartite graph: usernames on one side (bipartite=0), post titles on the
# other (bipartite=1), with one edge per post.
G = nx.Graph()

# Keep only rows where both endpoints are present so NaN never becomes a node
interactions = df[['username', 'title']].dropna()

# Add nodes (users and titles)
# NOTE(review): a string that occurs both as a username and as a title
# collapses into a single node -- confirm this cannot happen in the data.
G.add_nodes_from(interactions['username'], bipartite=0)
G.add_nodes_from(interactions['title'], bipartite=1)

# Add edges (interactions): one (username, title) pair per row, added in bulk
G.add_edges_from(zip(interactions['username'], interactions['title']))

# Step 3: Community Detection
communities = nx.algorithms.community.greedy_modularity_communities(G)

# Step 4: Influence Analysis
# Calculate centrality measures
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
# Raise the power-iteration cap above the default so larger graphs have room
# to converge instead of raising PowerIterationFailedConvergence.
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)

# Step 5: Social Media Graph Analytics
# Calculate network metrics
degree_distribution = nx.degree_histogram(G)
clustering_coefficient = nx.average_clustering(G)
# average_shortest_path_length raises NetworkXError on a disconnected graph,
# so measure it on the largest connected component. For a connected graph
# that component is the whole graph and the value is unchanged.
largest_component = max(nx.connected_components(G), key=len)
average_shortest_path_length = nx.average_shortest_path_length(G.subgraph(largest_component))

# Step 6: Visualization
# Visualize the graph
# A fixed seed makes the force-directed layout identical on every run
pos = nx.spring_layout(G, seed=42)  # Positions for all nodes
nx.draw(G, pos, with_labels=False, node_size=10)
plt.show()

# Visualize communities: each community gets the next colour of the
# matplotlib colour cycle ("C0", "C1", ...), drawn on the same layout
for i, community in enumerate(communities):
    nx.draw_networkx_nodes(G, pos, nodelist=community, node_color=f"C{i}", node_size=10)
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
