In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib  # Import joblib for model persistence

# Step 1: Read data from Excel documents
def read_excel_data(excel_files_path):
    transcripts = []
    for filename in os.listdir(excel_files_path):
        if filename.endswith('.xlsx'):
            # Read data from Excel document
            data = pd.read_excel(os.path.join(excel_files_path, filename))
            # Extract transcript and speaker from specific columns
            for index, row in data.iterrows():
                speaker = row["speaker;unicode"]  # Adjust column name as needed
                transcript = row["transcript;unicode"]  # Adjust column name as needed
                if isinstance(transcript, str):  # Check if transcript is a string
                    transcripts.append((speaker, transcript))
    return transcripts

# Step 2: Read summaries from separate files
def read_summary_files(summary_dir):
    summaries = {}
    for filename in os.listdir(summary_dir):
        if filename.endswith('.txt'):  # Assuming summaries are stored as text files
            with open(os.path.join(summary_dir, filename), "r", encoding="utf-8") as file:
                summary = file.read().strip()  # Read the summary
                summaries[filename] = summary
    return summaries

# Step 3: Preprocess text data
def preprocess_text(text):
    # Add preprocessing steps here if needed
    return text

# Step 4: Pair input text with summaries based on a common identifier
def pair_data(transcripts, summaries):
    dataset = []
    for filename, summary in summaries.items():
        # Extract document ID or other identifier from filename
        doc_id = os.path.splitext(filename)[0]
        # Match input text with summary based on identifier
        for speaker, transcript in transcripts:
            # Check if the doc_id is in the transcript
            if doc_id in transcript:
                # If transcript is a tuple, concatenate the elements
                if isinstance(transcript, tuple):
                    input_text = " ".join(map(str, transcript))
                else:
                    input_text = transcript
                # Preprocess input text and summary
                input_text = preprocess_text(input_text)
                summary = preprocess_text(summary)
                dataset.append((input_text, summary))
    return dataset

# Step 5: Merge data
def merge_data(excel_files_path, summary_dir):
    transcripts = read_excel_data(excel_files_path)
    summaries = read_summary_files(summary_dir)
    dataset = pair_data(transcripts, summaries)
    return dataset

# Step 6: Split the dataset into training, validation, and test sets
def split_dataset(dataset, test_size=0.1, val_size=0.1):
    if len(dataset) == 0:
        raise ValueError("Dataset is empty. Cannot split an empty dataset.")
    
    train_val_set, test_set = train_test_split(dataset, test_size=test_size, random_state=42)
    train_set, val_set = train_test_split(train_val_set, test_size=val_size/(1-test_size), random_state=42)
    return train_set, val_set, test_set

# Step 7: Model training and evaluation
def train_and_evaluate_model(train_set, val_set, test_set):
    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform([x[0] for x in train_set])
    X_val = vectorizer.transform([x[0] for x in val_set])
    X_test = vectorizer.transform([x[0] for x in test_set])
    
    # Get the labels
    y_train = [x[1] for x in train_set]
    y_val = [x[1] for x in val_set]
    y_test = [x[1] for x in test_set]
    
    # Define and train a classifier with different hyperparameters
    clf = LogisticRegression(C=1.0, penalty='l2', max_iter=1000)  # Experiment with different hyperparameters
    clf.fit(X_train, y_train)
    
    # Save the trained model
    model_path = "/kaggle/working/trained_model.pkl"
    joblib.dump(clf, model_path)
    print("Trained model saved at:", model_path)
    
    # Predictions
    y_pred_train = clf.predict(X_train)
    y_pred_val = clf.predict(X_val)
    y_pred_test = clf.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Print the accuracies
    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)
    print("Test Accuracy:", test_accuracy)

def main():
    # Step 1: Read data from Excel documents
    excel_files_path = "/kaggle/input/newdataset"  # Path to Excel files
    summary_dir = "/kaggle/input/summary"  # Path to summary files

    # Step 2: Merge data
    dataset = merge_data(excel_files_path, summary_dir)
    
    print("Total number of data points:", len(dataset))

    # Step 3: Split the dataset
    try:
        train_set, val_set, test_set = split_dataset(dataset)
    except ValueError as e:
        print("Error:", e)
        return

    # Step 7: Model training and evaluation
    train_and_evaluate_model(train_set, val_set, test_set)

if __name__ == "__main__":
    main()


Total number of data points: 5
Trained model saved at: /kaggle/working/trained_model.pkl
Training Accuracy: 0.6666666666666666
Validation Accuracy: 0.0
Test Accuracy: 0.0


In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Read data from Excel documents
def read_excel_data(excel_files_path):
    transcripts = []
    for filename in os.listdir(excel_files_path):
        if filename.endswith('.xlsx'):
            # Read data from Excel document
            data = pd.read_excel(os.path.join(excel_files_path, filename))
            # Extract transcript and speaker from specific columns
            for index, row in data.iterrows():
                speaker = row["speaker;unicode"]  # Adjust column name as needed
                transcript = row["transcript;unicode"]  # Adjust column name as needed
                if isinstance(transcript, str):  # Check if transcript is a string
                    transcripts.append((speaker, transcript))
    return transcripts

# Step 2: Read summaries from separate files
def read_summary_files(summary_dir):
    summaries = {}
    for filename in os.listdir(summary_dir):
        if filename.endswith('.txt'):  # Assuming summaries are stored as text files
            with open(os.path.join(summary_dir, filename), "r", encoding="utf-8") as file:
                summary = file.read().strip()  # Read the summary
                summaries[filename] = summary
    return summaries

# Step 3: Preprocess text data
def preprocess_text(text):
    # Add preprocessing steps here if needed
    return text

def pair_data(transcripts, summaries):
    dataset = []
    for filename, summary in summaries.items():
        print("Processing summary:", filename)
        doc_id = os.path.splitext(filename)[0]
        print("Document ID:", doc_id)
        matched_transcripts = [(speaker, transcript) for speaker, transcript in transcripts if doc_id in transcript]
        print("Number of matched transcripts:", len(matched_transcripts))
        for speaker, transcript in matched_transcripts:
            if isinstance(transcript, tuple):
                input_text = " ".join(map(str, transcript))
            else:
                input_text = transcript
            input_text = preprocess_text(input_text)
            summary = preprocess_text(summary)
            dataset.append((input_text, summary))
    return dataset


# Step 5: Merge data
def merge_data(excel_files_path, summary_dir):
    transcripts = read_excel_data(excel_files_path)
    print("Number of transcripts read:", len(transcripts))
    summaries = read_summary_files(summary_dir)
    print("Number of summaries read:", len(summaries))
    dataset = pair_data(transcripts, summaries)
    print("Number of data points after merging:", len(dataset))
    return dataset

# Step 6: Split the dataset into training, validation, and test sets
def split_dataset(dataset, test_size=0.1, val_size=0.1):
    if len(dataset) == 0:
        raise ValueError("Dataset is empty. Cannot split an empty dataset.")
    train_val_set, test_set = train_test_split(dataset, test_size=test_size, random_state=42)
    train_set, val_set = train_test_split(train_val_set, test_size=val_size/(1-test_size), random_state=42)
    print("Number of training examples:", len(train_set))
    print("Number of validation examples:", len(val_set))
    print("Number of test examples:", len(test_set))
    return train_set, val_set, test_set

# Step 7: Model training and evaluation
def train_and_evaluate_model(train_set, val_set, test_set):
    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform([x[0] for x in train_set])
    X_val = vectorizer.transform([x[0] for x in val_set])
    X_test = vectorizer.transform([x[0] for x in test_set])
    
    # Get the labels
    y_train = [x[1] for x in train_set]
    y_val = [x[1] for x in val_set]
    y_test = [x[1] for x in test_set]
    
    # Define and train a classifier with different hyperparameters
    clf = LogisticRegression(C=1.0, penalty='l2', max_iter=1000)  # Experiment with different hyperparameters
    clf.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = clf.predict(X_train)
    y_pred_val = clf.predict(X_val)
    y_pred_test = clf.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Print the accuracies
    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)
    print("Test Accuracy:", test_accuracy)

# Main function
def main():
    # Step 1: Read data from Excel documents
    excel_files_path = "/kaggle/input/newdataset"  # Path to Excel files
    summary_dir = "/kaggle/input/summary"  # Path to summary files

    # Step 2: Merge data
    dataset = merge_data(excel_files_path, summary_dir)

    # Step 3: Split the dataset
    try:
        train_set, val_set, test_set = split_dataset(dataset)
    except ValueError as e:
        print("Error:", e)
        return

    # Step 7: Model training and evaluation
    train_and_evaluate_model(train_set, val_set, test_set)

if __name__ == "__main__":
    main()


Number of transcripts read: 6229
Number of summaries read: 30
Processing summary: 3.txt
Document ID: 3
Number of matched transcripts: 0
Processing summary: 30.txt
Document ID: 30
Number of matched transcripts: 0
Processing summary: 10.txt
Document ID: 10
Number of matched transcripts: 1
Processing summary: 19.txt
Document ID: 19
Number of matched transcripts: 0
Processing summary: 5.txt
Document ID: 5
Number of matched transcripts: 1
Processing summary: 7.txt
Document ID: 7
Number of matched transcripts: 0
Processing summary: 21.txt
Document ID: 21
Number of matched transcripts: 0
Processing summary: 8.txt
Document ID: 8
Number of matched transcripts: 0
Processing summary: 18.txt
Document ID: 18
Number of matched transcripts: 0
Processing summary: 17.txt
Document ID: 17
Number of matched transcripts: 0
Processing summary: 11.txt
Document ID: 11
Number of matched transcripts: 0
Processing summary: 23.txt
Document ID: 23
Number of matched transcripts: 0
Processing summary: 9.txt
Document

In [3]:
!pip install gpt-2-simple

[0m[31mERROR: Could not find a version that satisfies the requirement gpt-2-simple (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gpt-2-simple[0m[31m
[0m

In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib  # Import joblib for model persistence
import gpt_2_simple as gpt2

# Step 1: Read data from Excel documents
def read_excel_data(excel_files_path):
    transcripts = []
    for filename in os.listdir(excel_files_path):
        if filename.endswith('.xlsx'):
            # Read data from Excel document
            data = pd.read_excel(os.path.join(excel_files_path, filename))
            # Extract transcript and speaker from specific columns
            for index, row in data.iterrows():
                speaker = row["speaker;unicode"]  # Adjust column name as needed
                transcript = row["transcript;unicode"]  # Adjust column name as needed
                if isinstance(transcript, str):  # Check if transcript is a string
                    transcripts.append((speaker, transcript))
    return transcripts

# Step 2: Read summaries from separate files
def read_summary_files(summary_dir):
    summaries = {}
    for filename in os.listdir(summary_dir):
        if filename.endswith('.txt'):  # Assuming summaries are stored as text files
            with open(os.path.join(summary_dir, filename), "r", encoding="utf-8") as file:
                summary = file.read().strip()  # Read the summary
                summaries[filename] = summary
    return summaries

# Step 3: Preprocess text data
def preprocess_text(text):
    # Add preprocessing steps here if needed
    return text

# Step 4: Pair input text with summaries based on a common identifier
def pair_data(transcripts, summaries):
    dataset = []
    for filename, summary in summaries.items():
        # Extract document ID or other identifier from filename
        doc_id = os.path.splitext(filename)[0]
        # Match input text with summary based on identifier
        for speaker, transcript in transcripts:
            # Check if the doc_id is in the transcript
            if doc_id in transcript:
                # If transcript is a tuple, concatenate the elements
                if isinstance(transcript, tuple):
                    input_text = " ".join(map(str, transcript))
                else:
                    input_text = transcript
                # Preprocess input text and summary
                input_text = preprocess_text(input_text)
                summary = preprocess_text(summary)
                dataset.append((input_text, summary))
    return dataset

# Step 5: Merge data
def merge_data(excel_files_path, summary_dir):
    transcripts = read_excel_data(excel_files_path)
    summaries = read_summary_files(summary_dir)
    dataset = pair_data(transcripts, summaries)
    return dataset

# Step 6: Split the dataset into training, validation, and test sets
def split_dataset(dataset, test_size=0.1, val_size=0.1):
    if len(dataset) == 0:
        raise ValueError("Dataset is empty. Cannot split an empty dataset.")
    
    train_val_set, test_set = train_test_split(dataset, test_size=test_size, random_state=42)
    train_set, val_set = train_test_split(train_val_set, test_size=val_size/(1-test_size), random_state=42)
    return train_set, val_set, test_set

# Step 7: Model training and evaluation
def train_and_evaluate_model(train_set, val_set, test_set):
    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform([x[0] for x in train_set])
    X_val = vectorizer.transform([x[0] for x in val_set])
    X_test = vectorizer.transform([x[0] for x in test_set])
    
    # Get the labels
    y_train = [x[1] for x in train_set]
    y_val = [x[1] for x in val_set]
    y_test = [x[1] for x in test_set]
    
    # Define and train a classifier with different hyperparameters
    clf = LogisticRegression(C=1.0, penalty='l2', max_iter=1000)  # Experiment with different hyperparameters
    clf.fit(X_train, y_train)
    
    # Save the trained model
    model_path = "/kaggle/working/trained_model.pkl"
    joblib.dump(clf, model_path)
    print("Trained model saved at:", model_path)
    
    # Predictions
    y_pred_train = clf.predict(X_train)
    y_pred_val = clf.predict(X_val)
    y_pred_test = clf.predict(X_test)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Print the accuracies
    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)
    print("Test Accuracy:", test_accuracy)

# Step 8: Generate text summaries using GPT-2
def generate_summaries(texts):
    # Download pre-trained GPT-2 model (124M)
    gpt2.download_gpt2(model_name="124M")
    
    summaries = []
    for text in texts:
        summary = gpt2.generate(sess, return_as_list=True)[0]
        summaries.append(summary)
    return summaries

def main():
    # Step 1: Read data from Excel documents
    excel_files_path = "/kaggle/input/newdataset"  # Path to Excel files
    summary_dir = "/kaggle/input/summary"  # Path to summary files

    # Step 2: Merge data
    dataset = merge_data(excel_files_path, summary_dir)
    
    print("Total number of data points:", len(dataset))

    # Step 3: Split the dataset
    try:
        train_set, val_set, test_set = split_dataset(dataset)
    except ValueError as e:
        print("Error:", e)
        return

    # Step 7: Model training and evaluation
    train_and_evaluate_model(train_set, val_set, test_set)

    # Step 8: Generate text summaries using GPT-2
    input_texts = [x[0] for x in test_set]  # Extract input texts from test set
    gpt2_summaries = generate_summaries(input_texts)
    
    # Print GPT-2 generated summaries
    for i, summary in enumerate(gpt2_summaries):
        print("GPT-2 Summary", i+1, ":", summary)

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'gpt_2_simple'