In [None]:
# Mean sentence length (number of items per tokenized sentence) for the French and Greek texts.
# NOTE(review): tokenized_french_filtered / tokenized_greek_filtered are not defined in the
# visible part of this notebook — confirm they exist before running this cell.
avg_sentence_length_fr = sum(map(len, tokenized_french_filtered)) / len(tokenized_french_filtered)
avg_sentence_length_el = sum(map(len, tokenized_greek_filtered)) / len(tokenized_greek_filtered)

# Last expression of the cell: display both averages as a tuple
avg_sentence_length_fr, avg_sentence_length_el


In [1]:
import os
import pandas as pd

def calculate_avg_sentence_length(column):
    """Return the mean number of whitespace-separated tokens per string in ``column``.

    Items that are not ``str`` are ignored, and so are strings that yield no
    tokens (empty or whitespace-only). If nothing is left to average, 0 is
    returned.
    """
    # Token count of every string that contains at least one token
    token_counts = [
        len(tokens)
        for tokens in (cell.split() for cell in column if isinstance(cell, str))
        if tokens
    ]

    # Guard against dividing by zero when no usable sentence was found
    if not token_counts:
        return 0

    return sum(token_counts) / len(token_counts)

# Folder that is scanned for workbooks: the notebook's current working directory
current_directory = os.getcwd()

# Collect the Excel workbooks in that folder.
# - Names starting with "~$" are the lock files Excel keeps next to an open workbook;
#   they carry an .xlsx/.xls extension but cannot be parsed, so they are skipped.
# - os.listdir() returns names in arbitrary order, so sort for a reproducible report.
excel_files = sorted(
    file
    for file in os.listdir(current_directory)
    if file.endswith(('.xlsx', '.xls')) and not file.startswith('~$')
)

# Report the average sentence length of both languages for every workbook
for excel_file in excel_files:
    file_path = os.path.join(current_directory, excel_file)

    # Read the first sheet of the workbook
    df = pd.read_excel(file_path)

    # Two columns are needed: they are addressed by position, not by header name
    if df.shape[1] < 2:
        print(f"File {excel_file} does not have the required columns.")
        continue

    # Second column (index 1) is treated as French, first column (index 0) as Greek
    avg_sentence_length_fr = calculate_avg_sentence_length(df.iloc[:, 1])
    avg_sentence_length_el = calculate_avg_sentence_length(df.iloc[:, 0])

    # Print the results
    print(f"Title: {excel_file}")
    print(f"Avg sentence length FR: {avg_sentence_length_fr}")
    print(f"Avg sentence length EL: {avg_sentence_length_el}")
    print()  # Blank line for readability between files



Title: 1921 CRAINQUEBILLE FRANCE PROTOPATSIS FR EL.xlsx
Avg sentence length FR: 13.258646063281825
Avg sentence length EL: 13.85193982581156

Title: 1923 MADAME BOVARY FLAUBERT THEOTOKIS FR EL.xlsx
Avg sentence length FR: 18.59788189987163
Avg sentence length EL: 18.581443962051775

Title: 1924 ENFER BARBUSSE UNKNOWN FR EL.xlsx
Avg sentence length FR: 15.128777124315555
Avg sentence length EL: 14.848090982940699

Title: 1925 ASSOMMOIR ZOLA KOTSIKAS FR EL.xlsx
Avg sentence length FR: 16.963426070647078
Avg sentence length EL: 17.403313190247967

Title: 1925 MADELEINE ZOLA SIMIRIOTIS FR EL.xlsx
Avg sentence length FR: 15.885828025477707
Avg sentence length EL: 14.215267802175251

Title: 1930 GRANDET BALZAC PAPALEXANDROU FR EL.xlsx
Avg sentence length FR: 18.310518934081347
Avg sentence length EL: 18.160123387549074

Title: 1934 BEL-AMI MAUPASSANT UNKNOWN FR EL.xlsx
Avg sentence length FR: 13.926184728751112
Avg sentence length EL: 13.551192327630897

Title: 1934 NUIT ZOLA UNKNOWN FR EL.x

In [2]:
# Organize the results into an Excel table
import openpyxl
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if there is one) into the environment
load_dotenv()

# Destination folder for the workbook; the cell refuses to run without it
output_directory = os.getenv('OUTPUT_DIRECTORY')
if not output_directory:
    raise ValueError("The OUTPUT_DIRECTORY environment variable is not set. Please set it.")

# One record per title, holding the French and Greek average sentence lengths
data = [
    {"Title": "1921 CRAINQUEBILLE FRANCE PROTOPATSIS FR EL.xlsx", "Avg sentence length FR": 13.258646063281825, "Avg sentence length EL": 13.85193982581156},
    {"Title": "1923 MADAME BOVARY FLAUBERT THEOTOKIS FR EL.xlsx", "Avg sentence length FR": 18.59788189987163, "Avg sentence length EL": 18.581443962051775},
    # Add other entries here...
    {"Title": "1998 MEAULNES ALAIN FOURNIER PALLANTIOU FR EL.xlsx", "Avg sentence length FR": 16.194814814814816, "Avg sentence length EL": 15.738829918538633}
]

# Start from an empty workbook and fill its default sheet
wb = openpyxl.Workbook()
sheet = wb.active

# Row 1 is the header; every record then becomes the next row (columns A, B, C)
sheet.append(["Title", "Avg sentence length FR", "Avg sentence length EL"])
for entry in data:
    sheet.append([
        entry["Title"],
        entry["Avg sentence length FR"],
        entry["Avg sentence length EL"],
    ])

# Echo the sheet content row by row so it can be checked by eye
for row in sheet.iter_rows(values_only=True):
    print(row)

# Write the workbook into the configured folder
output_file = os.path.join(output_directory, "Avg_Sentence_Length.xlsx")
wb.save(output_file)

print(f"Data has been saved to {output_file}")


('Title', 'Avg sentence length FR', 'Avg sentence length EL')
('1921 CRAINQUEBILLE FRANCE PROTOPATSIS FR EL.xlsx', 13.258646063281825, 13.85193982581156)
('1923 MADAME BOVARY FLAUBERT THEOTOKIS FR EL.xlsx', 18.59788189987163, 18.581443962051775)
('1924 ENFER BARBUSSE UNKNOWN FR EL.xlsx', 15.128777124315555, 14.848090982940699)
('1925 ASSOMMOIR ZOLA KOTSIKAS FR EL.xlsx', 16.963426070647078, 17.403313190247967)
('1925 MADELEINE ZOLA SIMIRIOTIS FR EL.xlsx', 15.885828025477707, 14.215267802175251)
('1930 GRANDET BALZAC PAPALEXANDROU FR EL.xlsx', 18.310518934081347, 18.160123387549074)
('1934 BEL-AMI MAUPASSANT UNKNOWN FR EL.xlsx', 13.926184728751112, 13.551192327630897)
('1934 NUIT ZOLA UNKNOWN FR EL.xlsx', 15.147521160822249, 16.141190198366395)
('1935 GIDE PORTE GAZI FR EL.xlsx', 17.102895322939865, 14.509081983308787)
('1940 IMMORALISTE GIDE CHOURMOUZIOS FR EL.xlsx', 17.073245614035088, 16.650131694468833)
('1940 MOURET ZOLA PIKROS FR EL.xlsx', 15.100408788962698, 15.36193980812196)
('1

In [3]:
# Add a column with the comparison and count the comparison symbols
import openpyxl
import os
from dotenv import load_dotenv

# Load environment variables from a .env file (optional)
load_dotenv()

# Retrieve file paths from environment variables
input_file_path = os.getenv('INPUT_FILE_PATH')
output_file_path = os.getenv('OUTPUT_FILE_PATH')

if not input_file_path or not output_file_path:
    raise ValueError("The INPUT_FILE_PATH and OUTPUT_FILE_PATH environment variables are not set. Please set them.")

# Load the existing workbook; column B holds the FR average, column C the EL average
wb = openpyxl.load_workbook(input_file_path)
sheet = wb.active

# Add a new column header for comparison
sheet["D1"] = "Comparison"

# Initialize counters
greater_than_count = 0
less_than_count = 0
equal_count = 0
skipped_count = 0  # rows whose B/C cells cannot be compared

# Iterate through the data rows (row 1 is the header) and compare the values
for row in range(2, sheet.max_row + 1):
    avg_fr = sheet[f"B{row}"].value
    avg_el = sheet[f"C{row}"].value

    # An empty cell reads as None and a text cell as str; comparing those with
    # "<" / ">" raises TypeError and would abort before the workbook is saved.
    # Such rows are left without a comparison symbol instead.
    if not isinstance(avg_fr, (int, float)) or not isinstance(avg_el, (int, float)):
        skipped_count += 1
        continue

    if avg_fr > avg_el:
        comparison = ">"
        greater_than_count += 1
    elif avg_fr < avg_el:
        comparison = "<"
        less_than_count += 1
    else:
        comparison = "="
        equal_count += 1
    sheet[f"D{row}"] = comparison

# Save the new workbook
wb.save(output_file_path)

# Print the counts
print(f"Number of '>': {greater_than_count}")
print(f"Number of '<': {less_than_count}")
print(f"Number of '=': {equal_count}")
if skipped_count:
    # Only reported when something was skipped, so the usual output is unchanged
    print(f"Rows skipped (missing or non-numeric values): {skipped_count}")

print(f"Comparison data has been saved to {output_file_path}")


Number of '>': 24
Number of '<': 10
Number of '=': 0
Comparison data has been saved to D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\Outputs_FINAL\Avg Sentence Length Comparison.xlsx


In [6]:
# Show the segments with the largest difference in length
import pandas as pd
import spacy

# One spaCy pipeline per language: large French model and large Greek model
nlp_fr, nlp_el = (
    spacy.load("fr_core_news_lg"),
    spacy.load("el_core_news_lg"),
)

def sentence_length(text, nlp):
    """Count the alphabetic tokens of ``text`` as produced by ``nlp``.

    ``nlp`` is called on the text and every resulting token whose ``is_alpha``
    attribute is true is counted. Returns ``None`` when ``text`` is not a
    string, so such inputs can be filtered out afterwards.
    """
    if isinstance(text, str):
        return len([token for token in nlp(text) if token.is_alpha])
    return None

# Workbook to analyse; the path is relative to the notebook's working directory
file_path = '1975 VOLEUR GENET DIMITRIADIS FR EL.xlsx'
data = pd.read_excel(file_path)

# Alphabetic-token count per segment. Empty cells are dropped before counting, so after
# index alignment those rows hold NaN in the new column; sentence_length itself
# returns None for any remaining non-string value.
data['Sentence_Length_FR'] = (
    data['FR']
    .dropna()
    .apply(lambda segment: sentence_length(segment, nlp_fr))
)
data['Sentence_Length_EL'] = (
    data['EL']
    .dropna()
    .apply(lambda segment: sentence_length(segment, nlp_el))
)

# Keep only the rows for which both lengths are available
data = data.dropna(subset=['Sentence_Length_FR', 'Sentence_Length_EL'])

# Absolute gap between the French and the Greek length of each segment pair
data['Length_Difference'] = (data['Sentence_Length_FR'] - data['Sentence_Length_EL']).abs()

# Largest gaps first; the report keeps the first 100 rows and five columns
sorted_length_differences = data.sort_values('Length_Difference', ascending=False)
top_entries = sorted_length_differences.head(100)[
    ['FR', 'EL', 'Sentence_Length_FR', 'Sentence_Length_EL', 'Length_Difference']
]

# Show the report; to keep it, write it to a workbook instead
print(top_entries)
# top_entries.to_excel('sentence_length_differences.xlsx')


                                                     FR  \
4512  Sa pudeur —j'ai dit comme elle ornait sa viole...   
2324  Quand plus tard, sans refuser d’être boulevers...   
4829  Quand j’eus quitté, comme je l’ai dit plus hau...   
1883  Peut-être obscurément redoutait-il qu’en ma po...   
2639  Cette machine à donner des coups de poings, de...   
...                                                 ...   
4789  A la complicité qui nous unit, s'ajoute un acc...   
1056  Je savais que ma place était au milieu d’elles...   
320   Il s’accomplira vraiment au coeur des ténèbres...   
1895  Sur-le-champ, au moment que j’écrivais, peut-ê...   
3876  Qu’on la gagne par une discipline mathématique...   

                                                     EL  Sentence_Length_FR  \
4512  Η ντροπαλότητά του (έχω πει τι κόσμημα ήταν αυ...               192.0   
2324  Όταν θα εφαρμόσω αργότερα την ίδια τακτική, τη...                53.0   
4829  Όταν, όπως είπα και πιο πάνω, έφυγα απ’ το Βέλ..

In [4]:
# Export the 200 segments with the most extreme values.
import pandas as pd
import spacy
import os
from dotenv import load_dotenv

# Read settings from a .env file, if one is present
load_dotenv()

# One spaCy pipeline per language: large French model and large Greek model
nlp_fr, nlp_el = (
    spacy.load("fr_core_news_lg"),
    spacy.load("el_core_news_lg"),
)

def sentence_length(text, nlp):
    """Count the alphabetic tokens of ``text`` as produced by ``nlp``.

    ``nlp`` is called on the text and every resulting token whose ``is_alpha``
    attribute is true is counted. Returns ``None`` when ``text`` is not a
    string, so such inputs can be filtered out afterwards.
    """
    if isinstance(text, str):
        return len([token for token in nlp(text) if token.is_alpha])
    return None

# Retrieve file paths from environment variables
input_file_path = os.getenv('INPUT_FILE_PATH')
output_file_path = os.getenv('OUTPUT_FILE_PATH')

# Stop early when either path is missing or empty
if not input_file_path or not output_file_path:
    raise ValueError("The INPUT_FILE_PATH and OUTPUT_FILE_PATH environment variables are not set. Please set them.")

# Load the data from the Excel file; the sheet must provide 'FR' and 'EL' columns
data = pd.read_excel(input_file_path)

# Count alphabetic tokens per segment; non-string cells (e.g. empty cells) yield None
data['Sentence_Length_FR'] = data['FR'].apply(lambda x: sentence_length(x, nlp_fr) if isinstance(x, str) else None)
data['Sentence_Length_EL'] = data['EL'].apply(lambda x: sentence_length(x, nlp_el) if isinstance(x, str) else None)

# Remove rows where sentence length could not be calculated (i.e., None values)
data = data.dropna(subset=['Sentence_Length_FR', 'Sentence_Length_EL'])

# Calculate the absolute difference in sentence lengths
data['Length_Difference'] = abs(data['Sentence_Length_FR'] - data['Sentence_Length_EL'])

# Sort by the differences in sentence length and keep the 200 largest
sorted_length_differences = data.sort_values(by='Length_Difference', ascending=False)
top_entries = sorted_length_differences[['FR', 'EL', 'Sentence_Length_FR', 'Sentence_Length_EL', 'Length_Difference']].head(200)

# Folder the result is written into. A bare file name has an empty dirname, which
# os.path.exists() reports as missing even though it means "current directory";
# fall back to os.curdir so such a path is not rejected by mistake.
output_directory = os.path.dirname(output_file_path) or os.curdir
if not os.path.exists(output_directory):
    print(f"Output directory does not exist: {output_directory}")
else:
    # Save the file
    top_entries.to_excel(output_file_path, index=False)
    print(f"Results saved to {output_file_path}")

    # Verify if the file was saved correctly
    if os.path.exists(output_file_path):
        print(f"File successfully saved: {output_file_path}")
    else:
        print("File was not saved. Please check the directory permissions and path.")


Results saved to D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\Outputs\avg_sent_length_differences_VOLEUR.xlsx
File successfully saved: D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\Outputs\avg_sent_length_differences_VOLEUR.xlsx


In [4]:

import pandas as pd
import spacy
import os
from dotenv import load_dotenv

# Read settings from a .env file, if one is present
load_dotenv()

# One spaCy pipeline per language: large French model and large Greek model
nlp_fr, nlp_el = (
    spacy.load("fr_core_news_lg"),
    spacy.load("el_core_news_lg"),
)

def sentence_length(text, nlp):
    """Count the alphabetic tokens of ``text`` as produced by ``nlp``.

    ``nlp`` is called on the text and every resulting token whose ``is_alpha``
    attribute is true is counted. Returns ``None`` when ``text`` is not a
    string, so such inputs can be filtered out afterwards.
    """
    if isinstance(text, str):
        return len([token for token in nlp(text) if token.is_alpha])
    return None

# Input workbook, destination folder and destination file name all come from the environment
input_file_path = os.getenv('INPUT_FILE_PATH')
output_directory = os.getenv('OUTPUT_DIRECTORY')
output_file_name = os.getenv('OUTPUT_FILE_NAME')

# All three settings are mandatory
if not (input_file_path and output_directory and output_file_name):
    raise ValueError("Environment variables for file paths are not set. Please set INPUT_FILE_PATH, OUTPUT_DIRECTORY, and OUTPUT_FILE_NAME.")

# Read the aligned segments; the sheet must provide 'FR' and 'EL' columns
data = pd.read_excel(input_file_path)

# Alphabetic-token count per segment; sentence_length returns None for non-string cells
data['Sentence_Length_FR'] = data['FR'].apply(lambda segment: sentence_length(segment, nlp_fr))
data['Sentence_Length_EL'] = data['EL'].apply(lambda segment: sentence_length(segment, nlp_el))

# Keep only the rows for which both lengths are available
data = data.dropna(subset=['Sentence_Length_FR', 'Sentence_Length_EL'])

# Absolute gap between the French and the Greek length of each segment pair
data['Length_Difference'] = (data['Sentence_Length_FR'] - data['Sentence_Length_EL']).abs()

# Largest gaps first; the export keeps the first 200 rows and five columns
sorted_length_differences = data.sort_values('Length_Difference', ascending=False)
top_entries = sorted_length_differences.head(200)[
    ['FR', 'EL', 'Sentence_Length_FR', 'Sentence_Length_EL', 'Length_Difference']
]

# Build the destination path; nothing is written unless the folder already exists
output_file_path = os.path.join(output_directory, output_file_name)
if os.path.exists(output_directory):
    top_entries.to_excel(output_file_path, index=False)
    print(f"Results saved to {output_file_path}")

    # Double-check that the file is really on disk
    if os.path.exists(output_file_path):
        print(f"File successfully saved: {output_file_path}")
    else:
        print(f"File was not saved. Please check the directory permissions and path.")
else:
    print(f"Output directory does not exist: {output_directory}")


Results saved to D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\porte_sent_length\avg_sent_length_differences_PORTE.xlsx
File successfully saved: D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\porte_sent_length\avg_sent_length_differences_PORTE.xlsx


In [9]:
# The results come out fine, but execution takes quite a long time to complete.
import os
import glob
import pandas as pd
import spacy
from dotenv import load_dotenv

# Read settings from a .env file, if one is present
load_dotenv()

# One spaCy pipeline per language; this cell uses the small ("sm") French and Greek models
nlp_fr, nlp_el = (
    spacy.load("fr_core_news_sm"),
    spacy.load("el_core_news_sm"),
)

def sentence_length(text, nlp):
    """Count the alphabetic tokens of ``text`` as produced by ``nlp``.

    ``nlp`` is called on the text and every resulting token whose ``is_alpha``
    attribute is true is counted. Returns ``None`` when ``text`` is not a
    string, so such inputs can be filtered out afterwards.
    """
    # NOTE(review): only is_alpha is read from the tokens; a tokenizer-only call
    # might be enough and faster than the full pipeline — confirm before changing.
    if isinstance(text, str):
        return len([token for token in nlp(text) if token.is_alpha])
    return None

def process_file(file_path):
    """Measure the FR/EL length gap of every segment pair in one Excel workbook.

    The first column is treated as Greek and the second as French, by position
    rather than by header name. Uses the module-level ``nlp_el`` / ``nlp_fr``
    pipelines together with ``sentence_length``.

    Returns the sheet's rows with four added columns — ``Sentence_Length_EL``,
    ``Sentence_Length_FR``, ``Length_Difference`` (absolute gap) and ``Title``
    (the file's base name) — after dropping rows where either length is
    missing. A workbook with fewer than two columns yields an empty DataFrame
    and a printed notice.
    """
    data = pd.read_excel(file_path)

    # Both language columns are required
    if data.shape[1] < 2:
        print(f"File {file_path} does not have enough columns.")
        return pd.DataFrame()

    # Positional access: column 0 is assumed Greek, column 1 is assumed French
    el_texts, fr_texts = data.iloc[:, 0], data.iloc[:, 1]

    # Token counts per segment (None for non-string cells), then drop incomplete pairs
    measured = data.assign(
        Sentence_Length_EL=el_texts.apply(lambda cell: sentence_length(cell, nlp_el)),
        Sentence_Length_FR=fr_texts.apply(lambda cell: sentence_length(cell, nlp_fr)),
    ).dropna(subset=['Sentence_Length_FR', 'Sentence_Length_EL'])

    # Absolute gap per pair, plus the source file name so rows stay traceable
    gap = (measured['Sentence_Length_FR'] - measured['Sentence_Length_EL']).abs()
    return measured.assign(Length_Difference=gap, Title=os.path.basename(file_path))

# Retrieve input/output locations from environment variables
input_directory = os.getenv('INPUT_DIRECTORY')
output_directory = os.getenv('OUTPUT_DIRECTORY')
output_file_name = os.getenv('OUTPUT_FILE_NAME')

# Fail with a clear message before any expensive work; an unset variable would
# otherwise surface as a TypeError from os.makedirs / os.path.join.
if not input_directory or not output_directory or not output_file_name:
    raise ValueError("Environment variables for file paths are not set. Please set INPUT_DIRECTORY, OUTPUT_DIRECTORY, and OUTPUT_FILE_NAME.")

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)
output_file_path = os.path.join(output_directory, output_file_name)

# Process all .xlsx files in the input directory.
# Frames are collected in a list and concatenated once: concatenating inside the
# loop re-copies all previously accumulated rows on every iteration.
frames = []
for file_path in sorted(glob.glob(os.path.join(input_directory, '*.xlsx'))):
    # "~$name.xlsx" is the lock file Excel keeps next to an open workbook; it cannot be parsed
    if os.path.basename(file_path).startswith('~$'):
        continue
    # When input and output folders coincide, do not feed a previous result back in
    if os.path.normcase(os.path.abspath(file_path)) == os.path.normcase(os.path.abspath(output_file_path)):
        continue
    file_data = process_file(file_path)
    if not file_data.empty:
        frames.append(file_data)

# Without any rows there is no 'Length_Difference' column to sort by
if not frames:
    raise ValueError(f"No usable Excel data found in {input_directory}")

all_data = pd.concat(frames, ignore_index=True)

# Sort by the differences in sentence length and get the top 200 entries
sorted_data = all_data.sort_values(by='Length_Difference', ascending=False)
top_entries = sorted_data.head(200)

# Write the result workbook
top_entries.to_excel(output_file_path, index=False)

print(f"Top 200 entries saved to {output_file_path}")


Top 200 entries saved to D:\ΔΙΔΑΚΤΟΡΙΚΗ ΠΡΟΤΑΣΗ\ΣΩΜΑ ΛΟΓΟΤΕΧΝΙΚΩΝ ΚΕΙΜΕΝΩΝ\ΠΑΡΑΛΛΗΛΟ ΗΣΚ ΑΝΑΦΟΡΑΣ\MARIA CORPUS DRIVE\EXCEL RETRIEVED\Outputs_FINAL\Top_200_Avg_Sentence_Length_Differences.xlsx
