<a href="https://colab.research.google.com/github/MK316/Spring2024/blob/main/DLTESOL/P1F_TESOL_0507.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# CSV analysed in this notebook; the cells below use its 'Group', 'Q1'-'Q5'
# and 'Comments' columns.
url = "https://raw.githubusercontent.com/MK316/Spring2024/main/DLTESOL/data/MP1-TESOL.csv"

# pandas reads the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Per-group bar charts: one panel per group, one bar per question (Q1-Q5).
# Requires the DataFrame 'df' loaded above, with a 'Group' column and Q1-Q5.
question_cols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Coerce the question columns to numbers; entries that cannot be parsed
# become NaN (errors='coerce') and are skipped by mean() below.
df[question_cols] = df[question_cols].apply(pd.to_numeric, errors='coerce')

# Mean score per group (rows) and question (columns)
group_means = df.groupby('Group')[question_cols].mean()

# One panel per group. The panel count comes from the data instead of a
# hard-coded 4, so no group is left without an axis and no axis is left
# empty. squeeze=False keeps 'axes' two-dimensional even for a single panel;
# max(..., 1) avoids asking matplotlib for a zero-column grid.
n_panels = max(len(group_means), 1)
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(5 * n_panels, 5), squeeze=False)
axes = axes[0]  # the single row of panels

for ax, (group, data) in zip(axes, group_means.iterrows()):
    data.plot(kind='bar', ax=ax, color=['blue', 'orange', 'green', 'red', 'purple'])
    ax.set_title(f'Average Scores for {group}')
    # The y-axis is cut to 3-5 to magnify differences between bars.
    # NOTE(review): presumably scores are on a 1-5 scale; a mean below 3
    # would not be visible with these limits.
    ax.set_ylim(3, 5)
    ax.set_ylabel('Average Score')
    ax.set_xlabel('Questions')

plt.tight_layout()
plt.show()


# By questions

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-question comparison: one panel per question, one bar per group.
# Relies on the DataFrame 'df' loaded earlier in the notebook.
questions = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']  # List of questions

# Make sure the answers are numeric (non-numeric entries turn into NaN)
df[questions] = df[questions].apply(pd.to_numeric, errors='coerce')

# Group means, flipped so that each row is a question and each column a group
group_means = df.groupby('Group')[questions].mean().T

# One subplot per question, laid out in a single row
fig, axes = plt.subplots(nrows=1, ncols=len(questions), figsize=(20, 5))

bar_colors = ['blue', 'orange', 'green', 'red', 'purple']
for ax, question in zip(axes, questions):
    per_group = group_means.loc[question]  # this question's mean in every group
    per_group.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(f'Group Comparison for {question}')
    ax.set_ylim(3, 5)  # y-axis restricted to the 3-5 band
    ax.set_ylabel('Average Score')
    ax.set_xlabel('Groups')

plt.tight_layout()
plt.show()


# Radar chart

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import pi

# Radar (spider) charts: one polar panel per group, one spoke per question.
# Requires the DataFrame 'df' loaded earlier in the notebook.
df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']] = df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].apply(pd.to_numeric, errors='coerce')

# Get the mean scores for each group
group_means = df.groupby('Group')[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean()

# Number of variables we're plotting.
categories = list(group_means.columns)
N = len(categories)

# Angle of each spoke: the circle is divided evenly between the variables.
# The first angle is repeated at the end so plotted outlines close.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Initialize the spider plot: a 2x2 grid of polar axes
fig, axes = plt.subplots(figsize=(8, 8), nrows=2, ncols=2, subplot_kw=dict(polar=True))
axes = axes.flatten()  # Flatten the array to iterate easily

# zip() pairs panels with groups; with more than four groups the extra
# groups are not drawn.
for ax, (group, row) in zip(axes, group_means.iterrows()):
    data = row.tolist()
    data += data[:1]  # complete the loop

    # First spoke at the top, remaining spokes clockwise
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Label the spokes of THIS panel. plt.xticks() only affects the current
    # axes, so calling it here would label a single panel and leave the
    # others with default angle ticks.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, color='grey', size=8)

    # Draw ylabels
    ax.set_rlabel_position(0)
    ax.set_ylim(0, 5)  # Assuming score scale is 0-5

    # Plot data
    ax.plot(angles, data, linewidth=2, linestyle='solid', label=group)
    ax.fill(angles, data, alpha=0.4)

    # Title in the same colour as the outline that was just drawn
    ax.set_title(group, size=11, color=ax.get_lines()[-1].get_color(), y=1.1)

# Hide panels that did not receive a group (fewer than four groups)
for unused_ax in axes[len(group_means):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import pi

# Radar charts per group with descriptive spoke labels, radial axis 0-5.
# Relies on the DataFrame 'df' loaded earlier in the notebook.
score_cols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
df[score_cols] = df[score_cols].apply(pd.to_numeric, errors='coerce')

# Average of every question within each group
group_means = df.groupby('Group')[score_cols].mean()

# Spoke labels, listed in the same order as the question columns
categories = ['Q1-Inform', 'Q2-UseDT', 'Q3-Integration', 'Q4-Engagement', 'Q5-Clarity']
N = len(categories)

# Evenly spaced spoke angles; the first one is repeated to close the outline
angles = [k / float(N) * 2 * pi for k in range(N)]
angles = angles + angles[:1]

# 2x2 grid of polar axes, flattened for simple iteration
fig, axes = plt.subplots(figsize=(10, 10), nrows=2, ncols=2, subplot_kw=dict(polar=True))
axes = axes.flatten()

for ax, (group, scores) in zip(axes, group_means.iterrows()):
    # Repeat the first value so the polygon ends where it started
    values = scores.tolist()
    values = values + values[:1]

    # First spoke at the top, remaining spokes clockwise
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Radial tick labels along angle 0, radial range 0-5
    ax.set_rlabel_position(0)
    ax.set_ylim(0, 5)

    # One tick per question, labelled with its description
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)

    # Outline plus translucent fill
    (outline,) = ax.plot(angles, values, linewidth=2, linestyle='solid', label=group)
    ax.fill(angles, values, alpha=0.4)

    # Panel title takes the colour of the outline
    ax.set_title(group, size=11, color=outline.get_color(), y=1.1)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import pi

# Radar charts per group, showing only the 3-5 part of the radial axis.
# Relies on the DataFrame 'df' loaded earlier in the notebook.
df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']] = df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].apply(
    pd.to_numeric, errors='coerce'
)

# Rows: groups; columns: mean of each question
group_means = df.groupby('Group')[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean()

# Text shown at the end of each spoke
categories = [
    'Q1-Inform',
    'Q2-UseDT',
    'Q3-Integration',
    'Q4-Engagement',
    'Q5-Clarity',
]
N = len(categories)

# Spoke number `step` sits at fraction step/N of a full turn; the first angle
# is appended again so that outlines end where they started
angles = [step / float(N) * 2 * pi for step in range(N)]
angles.extend(angles[:1])

# Four polar panels (2 x 2) held in a flat array
fig, axes = plt.subplots(figsize=(10, 10), nrows=2, ncols=2, subplot_kw=dict(polar=True))
axes = axes.flatten()

panels = zip(axes, group_means.iterrows())
for ax, (group, row) in panels:
    # Group means with the first value repeated at the end
    data = row.tolist()
    data.extend(data[:1])

    # Start at the top and proceed clockwise
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Radial labels along angle 0; radial axis limited to 3-5
    ax.set_rlabel_position(0)
    ax.set_ylim(3, 5)

    # Question labels around the circle
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)

    # Draw the outline and shade its interior
    ax.plot(angles, data, linewidth=2, linestyle='solid', label=group)
    ax.fill(angles, data, alpha=0.4)

    # Colour the title like the most recently drawn line
    title_color = ax.get_lines()[-1].get_color()
    ax.set_title(group, size=11, color=title_color, y=1.1)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import pi

# Radar charts per group for Q1, Q3, Q4 and Q5 only (Q2 is left out).
# Relies on the DataFrame 'df' loaded earlier in the notebook.
all_questions = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
df[all_questions] = df[all_questions].apply(pd.to_numeric, errors='coerce')

# Plotted column -> label shown on its spoke (insertion order = plot order)
spoke_labels = {
    'Q1': 'Q1-Inform',
    'Q3': 'Q3-Integration',
    'Q4': 'Q4-Engagement',
    'Q5': 'Q5-Clarity',
}

# Group means of the plotted questions only
group_means = df.groupby('Group')[list(spoke_labels)].mean()

# Spoke labels and their count
categories = list(spoke_labels.values())
N = len(categories)

# One angle per spoke, plus the first angle again to close the outline
angles = [idx / float(N) * 2 * pi for idx in range(N)]
angles = [*angles, *angles[:1]]

# 2x2 polar panels, flattened to a one-dimensional array
fig, axes = plt.subplots(figsize=(10, 10), nrows=2, ncols=2, subplot_kw=dict(polar=True))
axes = axes.flatten()

for ax, (group, means) in zip(axes, group_means.iterrows()):
    # Closed sequence of values: first value repeated at the end
    closed = means.tolist()
    closed = [*closed, *closed[:1]]

    # Orientation: first spoke at the top, clockwise order
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Radial labels along angle 0; radial axis limited to 3-5
    ax.set_rlabel_position(0)
    ax.set_ylim(3, 5)

    # Spoke ticks and their labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)

    # Outline and translucent fill
    drawn = ax.plot(angles, closed, linewidth=2, linestyle='solid', label=group)
    ax.fill(angles, closed, alpha=0.4)

    # Title coloured like the outline
    ax.set_title(group, size=11, color=drawn[-1].get_color(), y=1.1)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import pi

# Single radar chart with all groups overlaid, one coloured outline per group.
# Requires the DataFrame 'df' loaded earlier in the notebook.
df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']] = df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].apply(pd.to_numeric, errors='coerce')

# Get the mean scores for each group
group_means = df.groupby('Group')[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean()

# Spoke labels (one per question)
categories = ['Q1-Informativeness', 'Q2-UseDT', 'Q3-Integration', 'Q4-Engagement', 'Q5-Clarity']
N = len(categories)

# Calculate angle for each category
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]  # Complete the loop for the radar chart

# Initialize the spider plot
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# The axis configuration does not depend on the group, so it is applied once
# here instead of being repeated on every loop iteration.
ax.set_theta_offset(pi / 2)   # first spoke at the top
ax.set_theta_direction(-1)    # spokes in clockwise order
ax.set_rlabel_position(0)     # radial tick labels along angle 0
# Only the 3-5 band of the radial axis is shown.
# NOTE(review): this assumes every group mean is at least 3 - confirm.
ax.set_ylim(3, 5)

# Setting the labels for questions
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)

# Define colors for each group
colors = ['b', 'g', 'r', 'c']  # Blue, Green, Red, Cyan for G1, G2, G3, G4

# zip() stops at the shorter input: with more groups than colours, the
# extra groups are not drawn.
for (group, data), color in zip(group_means.iterrows(), colors):
    values = data.tolist()
    values += values[:1]  # complete the loop

    # Outline plus translucent fill in the group's colour
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=group, color=color)
    ax.fill(angles, values, alpha=0.25, color=color)  # Adjust transparency here

# Add legend (anchored outside the upper-right corner of the axes)
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.2))

# Show plot
plt.show()


# Sentiment Analysis

In [None]:
import pandas as pd
from textblob import TextBlob

# Sample data loading or assuming df is already loaded
# df = pd.read_csv('path_to_your_file.csv')

# Combine comments for each group.
# Missing comments are replaced by '' before the cast: astype(str) alone
# turns NaN into the literal text 'nan', which would then be joined into the
# group's text as if a respondent had written it.
df['Comments'] = df['Comments'].fillna('').astype(str)

# One row per group; 'Comments' holds that group's comments joined by spaces
grouped_comments = df.groupby('Group')['Comments'].apply(' '.join).reset_index()

# Helper returning both TextBlob sentiment scores for a piece of text
def get_sentiment(text):
    """Return ``(polarity, subjectivity)`` of *text* as computed by TextBlob."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Score every group's combined text, then split the (polarity, subjectivity)
# pairs into two parallel sequences, one per output column
sentiment_pairs = grouped_comments['Comments'].apply(get_sentiment)
polarity_scores, subjectivity_scores = zip(*sentiment_pairs)
grouped_comments['Polarity'] = polarity_scores
grouped_comments['Subjectivity'] = subjectivity_scores

# Print one row per group with its two scores
result_columns = ['Group', 'Polarity', 'Subjectivity']
print(grouped_comments[result_columns])


Add a polarity column (one score per comment)

In [None]:
from textblob import TextBlob

# Per-comment polarity. 'df' must already be loaded and contain a
# 'Comments' column with the text of each comment.
# Cast to str first so that TextBlob always receives text
df['Comments'] = df['Comments'].astype(str)

# One TextBlob polarity score per row, in row order
df['Polarity'] = [TextBlob(comment).sentiment.polarity for comment in df['Comments']]


In [None]:

# Map a polarity score onto one of three sentiment labels
def categorize_sentiment(polarity):
    """Label a polarity score as 'Positive', 'Negative' or 'Neutral'.

    Scores strictly above 0.1 are 'Positive' and scores strictly below -0.1
    are 'Negative'. Everything else, including exactly 0.1 and -0.1, is
    'Neutral'.
    """
    if polarity > 0.1:
        return 'Positive'
    if polarity < -0.1:
        return 'Negative'
    return 'Neutral'

# Label every comment by passing its polarity through categorize_sentiment
df['Sentiment_Category'] = df['Polarity'].map(categorize_sentiment)


In [None]:
# Count comments per (group, sentiment category); after unstack() the rows
# are groups and the columns are sentiment categories.
sentiment_order = ['Negative', 'Neutral', 'Positive']
sentiment_counts = (
    df.groupby(['Group', 'Sentiment_Category'])
    .size()
    .unstack(fill_value=0)
    # Pin the column set and order. The colours below are assigned by column
    # position, so if a category never occurs (e.g. no 'Negative' comments)
    # the remaining categories would otherwise shift and get the wrong colour.
    .reindex(columns=sentiment_order, fill_value=0)
)

# Stacked bars: red = Negative, gray = Neutral, green = Positive
sentiment_counts.plot(kind='bar', stacked=True, color=['red', 'gray', 'green'])
plt.title('Sentiment Distribution per Group')
plt.xlabel('Group')
plt.ylabel('Number of Comments')
plt.xticks(rotation=0)  # Ensure group labels are horizontal for readability
plt.show()


Word cloud

In [None]:
!pip install wordcloud


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd


In [None]:
# Build one block of text per group ('df' must have 'Group' and 'Comments').
# Every comment is cast to str so that the values can be joined.
df['Comments'] = df['Comments'].astype(str)

# Join each group's comments with single spaces, then turn the result back
# into a regular frame with 'Group' and 'Comments' columns
comments_by_group = df.groupby('Group')['Comments']
grouped_comments = comments_by_group.apply(' '.join).reset_index()



In [None]:
# One word cloud per group, arranged in a two-column grid.
# The number of rows is derived from the number of groups, so every group
# gets a panel; a hard-coded 2x2 grid fails with an IndexError as soon as
# there are more than four groups.
n_groups = len(grouped_comments)
n_cols = 2
n_rows = max(1, (n_groups + n_cols - 1) // n_cols)  # ceiling division, at least one row
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 5 * n_rows), squeeze=False)
axes = axes.flatten()  # Flatten the array to make it easier to iterate

# Panels and groups are paired by position, independent of the frame's index
for ax, (_, row) in zip(axes, grouped_comments.iterrows()):
    # Generate the word cloud from the group's combined comment text
    cloud = WordCloud(width=800, height=400, background_color='white').generate(row['Comments'])

    # Display the word cloud
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # Turn off axis
    ax.set_title(f"Group {row['Group']}")

# Blank out grid cells that did not receive a group
for unused_ax in axes[n_groups:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Coerce the Q1-Q5 answers to numbers ('df' must already be loaded);
# anything that cannot be parsed becomes NaN
question_cols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
df[question_cols] = df[question_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# describe() summary of Q1-Q5, computed separately for every group
scores_by_group = df.groupby('Group')[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']]
group_stats = scores_by_group.describe()

# Plain-text dump of the summary table
print(group_stats)


In [None]:
# Per-group mean and standard deviation of each question
summary_funcs = ['mean', 'std']
group_mean_std = df.groupby('Group')[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].agg(summary_funcs)
print(group_mean_std)


In [None]:
import pandas as pd

# Requires the DataFrame 'df' loaded earlier in the notebook.
question_cols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Convert Q1-Q5 to numeric values; unparseable entries become NaN
df[question_cols] = df[question_cols].apply(pd.to_numeric, errors='coerce')

# Average of each question within every group (rows: groups)
group_means = df.groupby('Group')[question_cols].mean()

# Print the table so the numbers can be checked
print(group_means)


In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: groups on the x-axis, one bar per column of group_means
ax = group_means.plot(kind='bar', figsize=(12, 6))
ax.set_title('Average Scores for Q1 to Q5 Across Groups')
ax.set_xlabel('Group')
ax.set_ylabel('Average Scores')
plt.setp(ax.get_xticklabels(), rotation=0)  # horizontal group labels
ax.legend(title='Questions')
plt.show()


In [None]:
import pandas as pd

# Requires the DataFrame 'df' loaded earlier in the notebook.
question_cols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# Q1-Q5 as numbers; entries that cannot be parsed become NaN
df[question_cols] = df[question_cols].apply(pd.to_numeric, errors='coerce')

# Per-group mean of every question
group_means = df.groupby('Group')[question_cols].mean()

# Extra column: the row-wise average of those question means for each group
overall_by_group = group_means.mean(axis=1)
group_means = group_means.assign(**{'Overall Mean': overall_by_group})

# Print the new column so the numbers can be checked
print(group_means['Overall Mean'])


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 'Overall Mean' column of group_means, one bar per group
ax = group_means['Overall Mean'].plot(kind='bar', color='skyblue', figsize=(10, 6))
ax.set_title('Overall Average Score of Q1 to Q5 for Each Group')
ax.set_xlabel('Group')
ax.set_ylabel('Overall Average Score')
ax.set_ylim(3, 5)  # y-axis restricted to the 3-5 band
plt.setp(ax.get_xticklabels(), rotation=0)  # horizontal group labels
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Overall mean per group, one individually coloured bar per group plus a
# legend. Requires 'group_means' with an 'Overall Mean' column from the
# cells above.
group_labels = group_means.index  # Adjust based on your DataFrame index or manually define
colors = ['blue', 'green', 'red', 'purple']  # Base palette

# Bars are drawn at explicit integer positions and the tick labels are set
# at the same positions, so bars and labels line up whatever type the group
# labels have (placing bars by label only matches ticks 0..n-1 for strings).
positions = np.arange(len(group_labels))

# Group -> colour lookup. The palette repeats cyclically, so a group beyond
# the fourth is still drawn instead of being dropped.
color_map = {group: colors[i % len(colors)] for i, group in enumerate(group_labels)}

# Plot each group individually to control colours and legend entries
for position, (group, color) in zip(positions, color_map.items()):
    plt.bar(position, group_means.loc[group, 'Overall Mean'], color=color, label=group)

plt.title('Overall Average Score of Q1 to Q5 for Each Group')
plt.xlabel('Group')
plt.ylabel('Overall Average Score')
plt.ylim(3,5)
plt.xticks(positions, group_labels)  # Set group labels as x-tick labels

# Place the legend outside of the plot
plt.legend(title='Group', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()
