In [None]:
!pip install pandas transformers torch scikit-learn nltk datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') below can be read.
nltk.download('stopwords')

# Load the leader reflections CSV into a DataFrame.
file_path = '/content/Leader Reflections.csv'
data = pd.read_csv(file_path)

# The free-text reflections are read from the 'Reflection/Experience' column.
# Every cell is converted to str and the column is turned into a plain Python list.
text_data = data['Reflection/Experience'].astype(str).tolist()

# English stopwords as a set; remove_stopwords tests membership against it.
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from text
def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is split on whitespace. A token is dropped when its lowercase
    form is in the module-level ``stop_words`` set, either as-is or after
    stripping leading/trailing ASCII punctuation. Tokens that are kept are
    emitted unchanged (original case and punctuation) and joined with single
    spaces.

    Whitespace splitting leaves punctuation attached to words ("it.",
    "now,"), so comparing only the raw token lets such stopwords slip
    through; the stripped comparison closes that gap.

    Args:
        text: The string to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    import string  # local import: not provided by the cell's import block

    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered in stop_words:
            continue
        # A punctuation-only token strips to "" and is therefore kept,
        # exactly as before.
        if lowered.strip(string.punctuation) in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)

# Function to extract n-grams
def extract_ngrams(text, n):
    """Return the word n-grams of ``text`` as a list of ``n``-tuples.

    Tokens are produced by whitespace splitting and then normalised:
    leading/trailing ASCII punctuation is stripped and the token is
    lowercased. Tokens that are empty after stripping are discarded.

    Normalising matters for counting: without it "stage," and "stage"
    (or "Group" and "group") produce different n-grams and their counts
    are split across several entries.

    Args:
        text: The string to tokenise.
        n: Size of each n-gram (e.g. 2 for bigrams).

    Returns:
        A list of tuples, each holding ``n`` consecutive normalised tokens.
    """
    import string  # local import: not provided by the cell's import block

    tokens = []
    for raw_token in text.split():
        token = raw_token.strip(string.punctuation).lower()
        if token:
            tokens.append(token)
    return list(ngrams(tokens, n))

# Strip stopwords from every reflection before any counting happens.
cleaned_text_data = list(map(remove_stopwords, text_data))

# Build one list of n-grams per cleaned reflection (n = 2, i.e. bigrams).
n = 2
all_ngrams = []
for entry in cleaned_text_data:
    all_ngrams.append(extract_ngrams(entry, n))

# Merge the per-document lists into one flat list, then tally occurrences.
flat_ngrams = []
for doc_ngrams in all_ngrams:
    flat_ngrams.extend(doc_ngrams)
ngram_counts = Counter(flat_ngrams)

# Report the ten most frequent n-grams as "word word: count".
print("Most Common N-grams:")
for ngram, count in ngram_counts.most_common(10):
    joined = ' '.join(ngram)
    print(f"{joined}: {count}")

# Create a TF-IDF Vectorizer and fit it to the cleaned text data
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_text_data)

# Get feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame for TF-IDF scores: one row per document, one column per term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Number of keywords reported per document
TOP_K = 5

# Rank each document's terms once and reuse the result for both the printout
# and the CSV export. Only terms with a positive score are considered:
# nlargest() returns TOP_K entries regardless of their value, so a document
# with fewer than TOP_K scored terms would otherwise be padded with
# zero-score words that do not occur in it.
top_keywords_per_doc = [
    row[row > 0].nlargest(TOP_K).index.tolist()
    for _, row in tfidf_df.iterrows()
]

# Display the top keywords based on TF-IDF scores for each document
print("Top Keywords based on TF-IDF:")
for doc_number, doc_keywords in enumerate(top_keywords_per_doc, start=1):
    print(f"Document {doc_number}: {doc_keywords}")

# Save extracted themes and keywords to CSV
output_df = pd.DataFrame({
    'Document': range(1, len(top_keywords_per_doc) + 1),
    'Top Keywords': top_keywords_per_doc
})

output_df.to_csv('extracted_themes_and_keywords.csv', index=False)
print("Extracted themes and keywords saved to 'extracted_themes_and_keywords.csv'")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Most Common N-grams:
group project: 9
storming stage,: 9
storming stage: 9
impact project: 5
within team.: 4
group also: 4
group members: 4
gave us: 3
team members: 3
different perspectives: 3
Top Keywords based on TF-IDF:
Document 1: ['patients', 'optimism', 'art', 'hope', 'sense']
Document 2: ['future', 'us', 'well', 'accomplishment', 'acquire']
Document 3: ['program', 'drawings', 'form', 'could', 'also']
Document 4: ['goals', 'success', 'chosen', 'plans', 'recipients']
Document 5: ['history', 'bombers', 'heavy', 'impact', 'collaborated']
Document 6: ['elderly', 'being', 'improving', 'well', 'aspects']
Document 7: ['stage', 'group', 'day', 'absent', 'accepting']
Document 8: ['decision', 'without', 'group', 'member', 'make']
Document 9: ['others', 'better', 'environment', 'positive', 'within']
Document 10: ['level', 'team', 'experience', 'captured', 'characters']
Document 11: ['comes', 'goal', 'solving', 'problem', 'others']
Document 12: ['children', 'made', 'acknowledge', 'battles', 

In [None]:
# Leader's Role in Group Development categories and their associated behaviors.
# Structure: {category: {subcategory: [behavior, ...]}}.
# There is a single category ("Leadership Role") with four subcategories,
# each listing eight behavior labels.
leader_roles = {
    "Leadership Role": {
        "Commonly Shared Behaviors": [
            "Guidance and Direction",
            "Conflict Management",
            "Team Cohesion",
            "Motivating and Inspiring",
            "Participation Encouragement",
            "Effective Communication",
            "Personal and Team Development Support",
            "Recognition and Appreciation"
        ],
        "Experiencing Leadership Role": [
            "Clear Instructions",
            "Mediating Conflicts",
            "Building Trust",
            "Positive Reinforcement",
            "Active Involvement",
            "Open Communication Channels",
            "Encouraging Skill Improvement",
            "Acknowledging Contributions"
        ],
        "Celebrating Success": [
            "Setting Expectations",
            "Handling Disagreements",
            "Promoting Unity",
            "Boosting Team Morale",
            "Ensuring Engagement",
            "Addressing Miscommunication",
            "Supporting Member Growth",
            "Regular Positive Feedback"
        ],
        "New Insights": [
            "Productive Team Performance",
            "Overcoming Challenges",
            "Group Achievement",
            "High Morale and Motivation",
            "Celebrating Team Contributions",
            "Successful Team Discussions",
            "Recognition of Individual Progress",
            "Celebrating Milestones"
        ]
    }
}

# Flatten the nested mapping into one record per (category, subcategory, behavior)
# so it can be handled as a table.
data_roles = [
    {'Category': category, 'Subcategory': subcategory, 'Behavior': behavior}
    for category, subcategories in leader_roles.items()
    for subcategory, behaviors in subcategories.items()
    for behavior in behaviors
]

df_roles = pd.DataFrame(data_roles)

# Show the resulting table of leadership roles.
print(df_roles)

# Persist the table as CSV (without the index column) and report where it went.
output_file_path = 'leaders_role_in_group_development.csv'
df_roles.to_csv(output_file_path, index=False)
print(f"Data saved to {output_file_path}")

           Category                   Subcategory  \
0   Leadership Role     Commonly Shared Behaviors   
1   Leadership Role     Commonly Shared Behaviors   
2   Leadership Role     Commonly Shared Behaviors   
3   Leadership Role     Commonly Shared Behaviors   
4   Leadership Role     Commonly Shared Behaviors   
5   Leadership Role     Commonly Shared Behaviors   
6   Leadership Role     Commonly Shared Behaviors   
7   Leadership Role     Commonly Shared Behaviors   
8   Leadership Role  Experiencing Leadership Role   
9   Leadership Role  Experiencing Leadership Role   
10  Leadership Role  Experiencing Leadership Role   
11  Leadership Role  Experiencing Leadership Role   
12  Leadership Role  Experiencing Leadership Role   
13  Leadership Role  Experiencing Leadership Role   
14  Leadership Role  Experiencing Leadership Role   
15  Leadership Role  Experiencing Leadership Role   
16  Leadership Role           Celebrating Success   
17  Leadership Role           Celebrating Succ

In [None]:
import pandas as pd
import re
import spacy

# Load the spaCy pipeline 'en_core_web_sm' once at module level.
# extract_sentences_with_keywords calls it and reads `doc.sents` to split a
# text into sentences.
# NOTE(review): presumably the `en_core_web_sm` model package must already be
# installed for this call to succeed — confirm for your environment.
nlp = spacy.load('en_core_web_sm')

def extract_sentences_with_keywords(text, keywords):
    """Map each sentence of ``text`` to the keywords that occur in it.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. Each keyword is searched as a whole word (``\\b`` boundaries),
    case-insensitively, in every sentence.

    Args:
        text: The text to analyse. Anything that is not a non-blank string
            (e.g. the NaN pandas yields for an empty cell) gives ``{}``.
        keywords: Iterable of keyword strings. Duplicates and empty strings
            are ignored; the first-seen order is preserved.

    Returns:
        dict mapping the stripped sentence text to the list of keywords
        found in it. Sentences without any keyword are omitted. If the same
        sentence text occurs more than once it appears once in the result.
    """
    # Guard against missing/blank cells instead of letting nlp() fail on them.
    if not isinstance(text, str) or not text.strip():
        return {}

    # Compile each distinct keyword once instead of once per sentence.
    # IGNORECASE replaces lowercasing the sentence, so keywords that contain
    # capitals can match as well. dict.fromkeys de-duplicates while keeping
    # order, which stops a repeated keyword from being listed several times.
    patterns = [
        (keyword, re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE))
        for keyword in dict.fromkeys(keywords)
        if keyword
    ]

    # Dictionary to store sentence-keywords pairs
    sentence_keywords = {}

    doc = nlp(text)
    for sent in doc.sents:
        sentence = sent.text.strip()
        found_keywords = [
            keyword for keyword, pattern in patterns if pattern.search(sentence)
        ]
        if found_keywords:
            sentence_keywords[sentence] = found_keywords

    return sentence_keywords

# Define your original keywords.
# One row per group of five keywords, as in the original list. The original
# literal mixed typographic quotes (‘ ’) with straight quotes and had commas
# inside some tokens, which made the list a SyntaxError; every entry is now a
# plain straight-quoted string with the stray commas removed.
# dict.fromkeys() drops repeated keywords (e.g. 'group', 'stage') while
# keeping first-seen order, so a keyword is reported at most once per sentence.
# NOTE(review): 'activité' is kept exactly as written — confirm it is not an
# auto-corrected 'activity'.
keywords = list(dict.fromkeys([
    'patients', 'optimism', 'art', 'hope', 'sense',
    'future', 'us', 'well', 'accomplishment', 'acquire',
    'program', 'drawings', 'form', 'could', 'also',
    'goals', 'success', 'chosen', 'plans', 'recipients',
    'history', 'bombers', 'heavy', 'impact', 'collaborated',
    'elderly', 'being', 'improving', 'well', 'aspects',
    'stage', 'group', 'day', 'absent', 'accepting',
    'decision', 'without', 'group', 'member', 'make',
    'others', 'better', 'environment', 'positive', 'within',
    'level', 'team', 'experience', 'captured', 'characters',
    'comes', 'goal', 'solving', 'problem', 'others',
    'children', 'made', 'acknowledge', 'battles', 'cheerful',
    'activité', 'game', 'since', 'other', 'become',
    'podcast', 'impact', '4th', 'capable', 'episodes',
    'varied', 'collaboration', 'goal', 'individual', 'communication',
    'working', 'group', 'alone', 'completion', 'depends',
    'good', 'admit', 'doubt', 'pressure', 'pushed',
    'art', 'therapy', 'recipients', 'individuals', 'express',
    'film', 'project', 'start', 'made', 'impact',
    'discussions', 'tough', 'personally', 'times', 'suggestions',
    'heaven', 'chat', 'would', 'kept', 'group',
    'we', 've', 'opinions', 'plan', 'comments',
    'everything', 'phase', 'managed', 'case', 'far',
    'group', 'professor', 'project', 'idea', 'help',
    'agreement', 'furthermore', 'group', 'regarding', 'anticipate',
    'yet', 'beginning', 'lapses', 'right', 'able',
    'podcast', 'meeting', 'everyone', 'task', 'week',
    'want', 'gives', 'decisions', 'since', 'hope',
    'team', 'stage', 'member', 'development', 'another',
    'feel', 'group', 'stage', 'say', 'make',
    'ms', 'salazar', 'meeting', 'aside', 'unresponsive',
    'everyone', 'yet', 'bit', 'observe', 'teammate',
    'teamwork', 'team', 'role', 'within', 'arise',
    'need', 'project', 'could', 'honestly', 'us',
    'principal', 'school', 'us', 'time', 'officer',
    'abandoned', 'orphanage', 'locate', 'elderly', 'decided',
    'face', 'now', 'stage', 'group', 'storming',
    'project', 'good', 'learning', 'stage', 'it'
]))



# Read the leader reflections CSV
df = pd.read_csv('/content/Leader Reflections.csv')

# `additional_keywords` is read below but is not defined anywhere in the
# visible notebook, so a fresh kernel would stop with a NameError. Fall back
# to an empty list (the "Literature Keywords" sheet is then skipped) without
# overwriting a list that was defined elsewhere.
try:
    additional_keywords
except NameError:
    additional_keywords = []  # TODO: fill in the literature-derived keywords


def _collect_keyword_sentences(frame, keyword_list, keyword_column):
    """Collect one record per sentence that contains a keyword.

    Args:
        frame: DataFrame with a 'Reflection/Experience' text column.
        keyword_list: Keywords passed to extract_sentences_with_keywords.
        keyword_column: Name of the output field holding the matched
            keywords, joined with ', '.

    Returns:
        List of dicts with keys 'Participant', ``keyword_column`` and
        'Sentence'. 'Participant' is "Participant <row index + 1>".
    """
    records = []
    if not keyword_list:
        # Nothing to search for: skip running the sentence splitter at all.
        return records
    for index, text in frame['Reflection/Experience'].items():
        # Empty cells are read as NaN (a float); there is no text to analyse.
        if not isinstance(text, str):
            continue
        participant = f"Participant {index + 1}"
        extracted = extract_sentences_with_keywords(text, keyword_list)
        for sentence, found_keywords in extracted.items():
            records.append({
                'Participant': participant,
                keyword_column: ', '.join(found_keywords),
                'Sentence': sentence
            })
    return records


# Process each row for original keywords
results = _collect_keyword_sentences(df, keywords, 'Keywords')

# Process each row for additional keywords
literature_results = _collect_keyword_sentences(
    df, additional_keywords, 'Literature_Keywords'
)

# Create output dataframes. Columns are given explicitly so the sheets keep
# their headers even when no sentence matched.
output_df = pd.DataFrame(results, columns=['Participant', 'Keywords', 'Sentence'])
literature_df = pd.DataFrame(
    literature_results, columns=['Participant', 'Literature_Keywords', 'Sentence']
)

# Create Excel writer object
with pd.ExcelWriter('Leader_analysis_results.xlsx', engine='openpyxl') as writer:
    # Write the main analysis to the first sheet
    output_df.to_excel(writer, sheet_name='Analysis Results', index=False)

    # Write the literature keywords results to the second sheet (only if there are results)
    if not literature_df.empty:
        literature_df.to_excel(writer, sheet_name='Literature Keywords', index=False)

print("Analysis completed and saved to Excel file.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the CSV file
file_path = '/content/Leader Reflections.csv'
data = pd.read_csv(file_path)

# Check columns and inspect data
print(data.columns)

# Convert the reflections to strings once and use the same list for both
# labelling and splitting. Previously the labels called .lower() on the raw
# column while the split used .astype(str), so a missing cell would crash the
# labelling step only.
reflection_texts = data['Reflection/Experience'].astype(str).tolist()

# Create labels manually based on the 'Reflection/Experience' content:
# 0 when the text mentions "leadership" (case-insensitive), otherwise 1.
# NOTE(review): in the recorded run the test split contained only label 1, so
# this heuristic presumably yields very imbalanced classes — confirm, and
# consider a stratified split once both classes have enough examples.
labels = [0 if 'leadership' in text.lower() else 1 for text in reflection_texts]

# Split data into training and test sets. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    reflection_texts, labels, test_size=0.2, random_state=42
)

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(texts):
    """Run the module-level ``tokenizer`` over ``texts``.

    The call enables padding and truncation, caps the length at 512 and
    requests ``return_tensors="pt"``; the tokenizer's result is returned
    unchanged.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    return tokenizer(texts, **tokenizer_options)


# Encode both splits with identical settings.
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Convert the encoded data to PyTorch datasets
class LeaderDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable collection with
            one entry per example (tensors or nested lists).
        labels: Sequence with one label per example.

    Indexing returns a dict with every encoding field for that example plus
    a ``'labels'`` tensor. The length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a tensor that is independent of the stored data."""
        # Calling torch.tensor() on an existing tensor emits a UserWarning on
        # every item access; clone().detach() is the recommended way to copy.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset objects
train_dataset = LeaderDataset(train_encodings, train_labels)
test_dataset = LeaderDataset(test_encodings, test_labels)

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Size the learning-rate warmup from the actual amount of training.
# The previous fixed warmup_steps=500 was far longer than the whole run (the
# recorded log shows only step 10 with logging_steps=10), so the learning
# rate never finished warming up. Roughly 10% of the run is used instead.
# NOTE(review): the step estimate assumes a single device and no gradient
# accumulation — confirm for multi-GPU runs.
num_train_epochs = 3
train_batch_size = 8
steps_per_epoch = -(-len(train_dataset) // train_batch_size)  # ceiling division
total_training_steps = steps_per_epoch * num_train_epochs
warmup_steps = max(1, total_training_steps // 10)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',                        # output directory
    num_train_epochs=num_train_epochs,             # number of training epochs
    per_device_train_batch_size=train_batch_size,  # batch size for training
    per_device_eval_batch_size=8,                  # batch size for evaluation
    warmup_steps=warmup_steps,                     # warmup steps for learning rate scheduler
    weight_decay=0.01,                             # strength of weight decay
    logging_dir='./logs',                          # directory for storing logs
    logging_steps=10,
    report_to="none"  # Disable Weights & Biases logging
)

# Initialize the Trainer with the model, its arguments and both datasets.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model on the held-out split.
# predict() is unpacked into three values; only `predictions` is used below.
predictions, label_ids, _ = trainer.predict(test_dataset)
# Pick, per example, the index of the largest value along dim 1 as the predicted label.
predicted_labels = torch.argmax(torch.tensor(predictions), dim=1)

# Generate classification report (true labels first, predictions second).
# NOTE(review): the recorded report lists only class 1 (support 8), so its
# perfect scores say nothing about class 0 — confirm the label balance.
print(classification_report(test_labels, predicted_labels))


Index(['Sections', 'Groups', 'Leader', 'Reflection/Experience', 'Unnamed: 4',
       'Unnamed: 5'],
      dtype='object')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.649


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

