In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the data
# NOTE(review): the code below reads the 'source' and 'response' columns, so
# exercise1.csv is assumed to provide both — confirm against the data file.
data = pd.read_csv("exercise1.csv")

# Separate AI and human responses (one boolean-mask selection per source label)
ai_responses = data.loc[data['source'] == 'AI', 'response']
human_responses = data.loc[data['source'] == 'Human', 'response']

# Preprocess the text
# nltk.word_tokenize (used by preprocess_text) needs the Punkt tokenizer data in
# addition to the stopword list; without it the first call raises LookupError on
# a fresh environment. Newer NLTK releases look for 'punkt_tab', older ones for
# 'punkt'. Requesting an unknown package only prints an error and returns False,
# so asking for both is safe.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* lowercased, tokenized and stripped of noise tokens.

    The text is lowercased and split with ``nltk.word_tokenize``; tokens that
    are not purely alphanumeric or that appear in the module-level
    ``stop_words`` set are dropped. The surviving tokens are re-joined with
    single spaces.

    Non-string input yields an empty string: pandas represents empty CSV
    cells as NaN (a float), which has no ``.lower()`` method and would
    otherwise raise ``AttributeError``.
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # isalnum() discards punctuation-only tokens and tokens containing
        # non-alphanumeric characters (e.g. "n't", "3.5").
        if token.isalnum() and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Run both corpora through the same cleaning routine, AI first, then human.
ai_responses, human_responses = [
    responses.apply(preprocess_text)
    for responses in (ai_responses, human_responses)
]

# Measure lexical diversity
def lexical_diversity(texts):
    """Return the type-token ratio of a collection of texts.

    Each text is split on whitespace; the result is the number of distinct
    words across all texts divided by the total number of words.

    Args:
        texts: Iterable of strings. It is consumed in a single pass, so
            one-shot iterables such as generators are supported.

    Returns:
        A float in ``[0, 1]``. An empty collection, or one containing no
        words at all, yields ``0.0`` instead of raising ZeroDivisionError.
    """
    total_words = 0
    vocabulary = set()
    for text in texts:
        words = text.split()
        total_words += len(words)
        vocabulary.update(words)

    if total_words == 0:
        return 0.0
    return len(vocabulary) / total_words

# Score each corpus (AI first, then human) and report both ratios.
ai_diversity, human_diversity = (
    lexical_diversity(ai_responses),
    lexical_diversity(human_responses),
)

print(
    f"AI Lexical Diversity: {ai_diversity}",
    f"Human Lexical Diversity: {human_diversity}",
    sep="\n",
)

# Generate word clouds: one per corpus, built from all of its preprocessed
# responses joined into a single string.
ai_wordcloud, human_wordcloud = [
    WordCloud(background_color='white').generate(" ".join(corpus))
    for corpus in (ai_responses, human_responses)
]

# Draw the two clouds side by side (AI on the left, human on the right).
plt.figure(figsize=(10, 5))
for position, (cloud, heading) in enumerate(
    ((ai_wordcloud, "AI Word Cloud"), (human_wordcloud, "Human Word Cloud")),
    start=1,
):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis("off")

plt.show()