In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import hashlib

# Read the Parquet file and create a Pandas DataFrame
file_path = 'non_depressed.parquet'
data_frame = pd.read_parquet(file_path)

# Columns to check for the value 1
columns_to_check = ['excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride']

# Boolean mask: True for rows where at least one of the checked columns equals 1.
# The comparison is vectorised (no per-row Python lambda), so it is fast on
# large frames and still yields a boolean Series when the frame is empty.
mask = data_frame[columns_to_check].eq(1).any(axis=1)

# Keep the matching rows that have a known author
selected_rows = data_frame[mask].dropna(subset=["author"])

# Drop the rows that have author = [deleted] or text = [removed]
selected_rows = selected_rows[
    (selected_rows["author"] != "[deleted]")
    & (selected_rows["text"] != "[removed]")
]

# Define a function to generate a unique ID using SHA-256 hash
def generate_unique_id(data):
    """Return the SHA-256 hex digest of ``data``.

    Args:
        data: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    return hashlib.sha256(data.encode('utf-8')).hexdigest()

# Replace the 'author' column with a 'unique_id' column holding the SHA-256
# digest of the author. assign() builds a new frame instead of writing a column
# into a filtered slice, which avoids pandas' SettingWithCopyWarning.
selected_rows = (
    selected_rows
    .assign(unique_id=selected_rows["author"].apply(generate_unique_id))
    .drop(columns='author')
)

# Group by 'unique_id' and concatenate 'text' column values with single spaces.
# Rows with missing text are skipped first: str.join raises TypeError on NaN/None.
concatenated_df = (
    selected_rows.dropna(subset=['text'])
    .groupby('unique_id')['text']
    .apply(' '.join)
    .reset_index(name='concatenated_body')
)

# Show the resulting DataFrame with concatenated 'text' values
print(concatenated_df)


                                               unique_id  \
0      000884bdc57c9b1474b85832d52d7e0cf4ea0e02a58793...   
1      00095ed1a1f295a3caf4358c8c5559ffa52cff72e62d81...   
2      0009b77b45ae261711ce288c5bb5c8575c667cdfc400b2...   
3      000dba296259c778164201b5fcac764aa2b65f4628179c...   
4      00115b3389cd8a3d16102527f23e9ec819afdef3d16666...   
...                                                  ...   
18673  ffdb574be94c53845b980a55cbb2d9ce0880c3246bdf24...   
18674  ffe21929c28b432efaa4bffc2785e42354b08aa2971d46...   
18675  ffe2acf9e7d9422eaeec6bc4e6a592283421a11f8761ff...   
18676  fff7d16c33a2dfd37eeea5dbe7a5b003b8b43d4b0b417e...   
18677  fffde19ceafc7b4eb1ee93a9907fd8b9ee9053293d6e5b...   

                                       concatenated_body  
0      I like how the Red Tie Legion chose a picture ...  
1      I used to love to mix addys and Valium 10s tho...  
2      Thanks for the advice man ! Have a great New Y...  
3      The simple ones feel elegant. The si

In [5]:
# Build the labelled frame for the non-depressed class:
#   'value' - the concatenated text (renamed from 'concatenated_body')
#   'label' - constant 0
# concatenated_df itself is left untouched, so this cell can be re-run safely;
# overwriting it would make a second run fail with a KeyError on
# 'concatenated_body'.
final_df = (
    concatenated_df[['concatenated_body']]
    .rename(columns={'concatenated_body': 'value'})
    .assign(label=0)
    .loc[:, ['label', 'value']]
)

In [7]:

# Write the final DataFrame to a Parquet file (the DataFrame index is not stored).
# NOTE(review): this path is the same file the loading cell reads as its raw
# input ('non_depressed.parquet'). Running this cell therefore replaces the raw
# data with the two-column label/value frame, and a later Restart & Run All
# fails once the checked columns are no longer present in the file. Consider
# writing to a distinct output file; confirm what downstream consumers expect.
output_parquet_file = 'non_depressed.parquet'
final_df.to_parquet(output_parquet_file, index=False)

In [None]:
import pandas as pd
import hashlib

# Read the Parquet file and create a Pandas DataFrame
file_path = 'depression.parquet'
data_frame = pd.read_parquet(file_path)

# Drop rows with a missing author. Rows with a missing body are dropped as
# well, because str.join cannot concatenate NaN/None values further down.
DepressionDataset = data_frame.dropna(subset=["author", "body"])

# Drop the rows that have author = [deleted] or body = [removed]
DepressionDataset = DepressionDataset[
    (DepressionDataset["author"] != "[deleted]")
    & (DepressionDataset["body"] != "[removed]")
]


def generate_unique_id(data):
    """Return the SHA-256 hex digest of ``data``.

    Args:
        data: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    return hashlib.sha256(data.encode('utf-8')).hexdigest()


# Replace the 'author' column with a 'unique_id' column holding the SHA-256
# digest of the author. assign() returns a new frame rather than writing into
# the filtered slice.
DepressionDataset = (
    DepressionDataset
    .assign(unique_id=DepressionDataset["author"].apply(generate_unique_id))
    .drop(columns='author')
)

# Group by 'unique_id' and concatenate 'body' column values with single spaces
concatenated_df = (
    DepressionDataset.groupby('unique_id')['body']
    .apply(' '.join)
    .reset_index(name='concatenated_body')
)

# Show the first rows of the resulting DataFrame with concatenated 'body' values
print(concatenated_df.head(20))

# Print the number of unique IDs
print("Number of unique IDs:", len(concatenated_df))

# Build the labelled frame for the depressed class:
#   'value' - the concatenated text (renamed from 'concatenated_body')
#   'label' - constant 1
# concatenated_df itself is left untouched so the cell can be re-run safely.
final_df = (
    concatenated_df[['concatenated_body']]
    .rename(columns={'concatenated_body': 'value'})
    .assign(label=1)
    .loc[:, ['label', 'value']]
)



In [None]:

# Persist final_df as a Parquet file; the DataFrame index is left out of the file
output_parquet_file = 'depressed.parquet'
final_df.to_parquet(path=output_parquet_file, index=False)